filter-repo: group repo analysis functions into a class

Signed-off-by: Elijah Newren <newren@gmail.com>
2024-07-06 18:32:14 +02:00 · 2018-12-25 21:54:16 -08:00 · 2018-12-25 21:54:16 -08:00 · 4e2110136e
commit 4e2110136e
parent 9887dd5cbe
1 changed files with 376 additions and 358 deletions
--- a/734
+++ b/734
@ -1891,10 +1891,15 @@ class FilteringOptions(object):
    FilteringOptions.sanity_check_args(args)
    return args
-def analyze_commit(stats, graph, commit, parents, date, file_changes):
+class RepoAnalyze(object):
-  def equiv_class(filename):
+
  # First, several helper functions for analyze_commit()
  @staticmethod
  def equiv_class(stats, filename):
    """Return the equivalence-class entry recorded for filename.

    The lookup is done in stats['equivalence'].  A filename with no
    recorded entry is reported as a one-element tuple holding only itself.
    """
    known = stats['equivalence']
    if filename in known:
      return known[filename]
    return (filename,)
  @staticmethod
  def setup_equivalence_for_rename(stats, oldname, newname):
    # if A is renamed to B and B is renamed to C, then the user thinks of
    # A, B, and C as all being different names for the same 'file'.  We record
@ -1911,18 +1916,22 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
    for f in new_tuple:
      stats['equivalence'][f] = new_tuple
  @staticmethod
  def setup_or_update_rename_history(stats, commit, oldname, newname):
    """Add commit to the set of commits recorded as renaming oldname.

    stats['rename_history'] maps an old name to a set of commits; the set
    is created on first use.  newname is accepted but not used here.
    """
    stats['rename_history'].setdefault(oldname, set()).add(commit)
  @staticmethod
  def handle_renames(stats, commit, change_types, filenames):
    for index, change_type in enumerate(change_types):
      if change_type == 'R':
        oldname, newname = filenames[index], filenames[-1]
-        setup_equivalence_for_rename(stats, oldname, newname)
+        RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname)
-        setup_or_update_rename_history(stats, commit, oldname, newname)
+        RepoAnalyze.setup_or_update_rename_history(stats, commit,
                                                   oldname, newname)
  @staticmethod
  def handle_file(stats, graph, commit, modes, shas, filenames):
    mode, sha, filename = modes[-1], shas[-1], filenames[-1]
@ -1936,7 +1945,7 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
    # If the file (or equivalence class of files) was recorded as deleted,
    # clearly it isn't anymore
-    equiv = equiv_class(filename)
+    equiv = RepoAnalyze.equiv_class(stats, filename)
    for f in equiv:
      stats[delmode].pop(f, None)
@ -1954,383 +1963,392 @@ def analyze_commit(stats, graph, commit, parents, date, file_changes):
        if f in stats['equivalence']:
          del stats['equivalence'][f]
-  graph.add_commit_and_parents(commit, parents)
+  @staticmethod
-  for change in file_changes:
+  def analyze_commit(stats, graph, commit, parents, date, file_changes):
-    modes, shas, change_types, filenames = change
+    graph.add_commit_and_parents(commit, parents)
-    if len(parents) == 1 and change_types.startswith('R'):
+    for change in file_changes:
-      change_types = 'R'  # remove the rename score; we don't care
+      modes, shas, change_types, filenames = change
-    if modes[-1] == '160000':
+      if len(parents) == 1 and change_types.startswith('R'):
-      continue
+        change_types = 'R'  # remove the rename score; we don't care
-    elif modes[-1] == '000000':
+      if modes[-1] == '160000':
      # Track when files/directories are deleted; see 'R' below about equiv_class
      for f in equiv_class(filenames[-1]):
        if any(x == '040000' for x in modes[0:-1]):
          stats['tree_deletions'][f] = date
        else:
          stats['file_deletions'][f] = date
    elif change_types.strip('AMT') == '':
      handle_file(stats, graph, commit, modes, shas, filenames)
    elif modes[-1] == '040000' and change_types.strip('RAM') == '':
      handle_file(stats, graph, commit, modes, shas, filenames)
    elif change_types.strip('RAM') == '':
      handle_file(stats, graph, commit, modes, shas, filenames)
      handle_renames(stats, commit, change_types, filenames)
    else:
      raise SystemExit("Unhandled change type(s): {} (in commit {})"
                       .format(change_types, commit))
 def gather_data(args):
  """Collect blob sizes and per-commit change data into a stats dict.

  Runs `git cat-file --batch-all-objects` to record the unpacked and on-disk
  size of every blob, keyed by object name.  Then runs a
  `git rev-list | git diff-tree` shell pipeline over args.refs, parses each
  commit's header and raw change lines, and hands them to analyze_commit().

  args: object whose `refs` attribute is an iterable of strings; they are
        joined with spaces and interpolated into the shell command.

  Returns the stats dict built below, with 'num_commits' set to the number
  of commits parsed.

  Raises SystemExit if the rev-list|diff-tree pipeline exits non-zero.
  """
  blob_size_progress = ProgressWriter()
  num_blobs = 0
  # Get sizes of blobs by sha1
  a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
  cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
                        bufsize = -1,
                        stdout = subprocess.PIPE)
  unpacked_size = {}
  packed_size = {}
  # Each output line carries the four fields requested in the format above.
  for line in cf.stdout:
    sha, objtype, objsize, objdisksize = line.split()
    objsize, objdisksize = int(objsize), int(objdisksize)
    # Only blob sizes are kept; other object types are read and dropped.
    if objtype == 'blob':
      unpacked_size[sha] = objsize
      packed_size[sha] = objdisksize
    # NOTE(review): incremented for every object line, not only blobs, so
    # the "blob sizes" progress figure is really the number of objects seen.
    num_blobs += 1
    blob_size_progress.show("Processed {} blob sizes".format(num_blobs))
  # NOTE(review): the exit status of cat-file is not checked here.
  cf.wait()
  blob_size_progress.finish()
  # Shared state handed to analyze_commit() for every commit and returned
  # to the caller at the end.
  stats = {'names': collections.defaultdict(set),
           'allnames' : set(),
           'file_deletions': {},
           'tree_deletions': {},
           'equivalence': {},
           'rename_history': collections.defaultdict(set),
           'unpacked_size': unpacked_size,
           'packed_size': packed_size,
           'num_commits': 0}
  # Setup the rev-list/diff-tree process
  commit_parse_progress = ProgressWriter()
  num_commits = 0
  # The format string makes every commit start with three header lines:
  # commit hash (%H), parent hashes (%P) and commit date (%cd, short form).
  # NOTE(review): args.refs is interpolated unquoted into a shell=True
  # command, so refs containing shell metacharacters would be interpreted
  # by the shell -- confirm callers only pass trusted values.
  cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
  dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
  f = dtp.stdout
  # f.next() is the Python 2 iterator method of the pipe's file object.
  # NOTE(review): on completely empty output this presumably raises
  # StopIteration before `cont` is ever computed -- confirm.
  line = f.next()
  cont = bool(line)
  graph = AncestryGraph()
  # Invariant at the top of each iteration: `line` holds the hash line of
  # the commit about to be parsed.
  while cont:
    commit = line.rstrip()
    parents = f.next().split()
    date = f.next().rstrip()
    # We expect a blank line next; if we get a non-blank line then
    # this commit modified no files and we need to move on to the next.
    # If there is no line, we've reached end-of-input.
    try:
      line = f.next().rstrip()
      cont = True
    except StopIteration:
      cont = False
    # If we haven't reached end of input, and we got a blank line meaning
    # a commit that has modified files, then get the file changes associated
    # with this commit.
    file_changes = []
    if cont and not line:
      # Assume end-of-input until a line that is not a change line (i.e. the
      # next commit's hash) is seen; in that case `line` keeps that hash for
      # the next iteration of the outer loop.
      cont = False
      for line in f:
        if not line.startswith(':'):
          cont = True
          break
        # n is the number of mode columns and of sha columns on the line:
        # one per parent (at least one) plus one more.  The line is expected
        # to start with n-1 colons.
        n = 1+max(1, len(parents))
        assert line.startswith(':'*(n-1))
        # Drop the leading colons and the final character (presumably the
        # line terminator).
        relevant = line[n-1:-1]
        # First n whitespace-separated fields are the modes; the remainder
        # (splits[n]) is left unsplit for the next step.
        splits = relevant.split(None, n)
        modes = splits[0:n]
        # Next n fields are the shas.
        splits = splits[n].split(None, n)
        shas = splits[0:n]
        # What is left is tab-separated: change type(s), then filename(s).
        splits = splits[n].split('\t')
        change_types = splits[0]
        filenames = [PathQuoting.dequote(x) for x in splits[1:]]
        file_changes.append([modes, shas, change_types, filenames])
    # Analyze this commit and update progress
    analyze_commit(stats, graph, commit, parents, date, file_changes)
    num_commits += 1
    commit_parse_progress.show("Processed {} commits".format(num_commits))
  # Show the final commits processed message and record the number of commits
  commit_parse_progress.finish()
  stats['num_commits'] = num_commits
  # Close the output, ensure rev-list|diff-tree pipeline completed successfully
  dtp.stdout.close()
  if dtp.wait():
    raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
  return stats
 def write_report(reportdir, stats):
  def datestr(datetimestr):
    """Return datetimestr, or the placeholder '<present>' when it is falsy."""
    if not datetimestr:
      return '<present>'
    return datetimestr
  def dirnames(path):
    """Yield each successive os.path.dirname() of path, ending with ''.

    The empty string is always the last value yielded.
    """
    parent = os.path.dirname(path)
    while parent != '':
      yield parent
      parent = os.path.dirname(parent)
    yield parent
  # Compute aggregate size information for paths, extensions, and dirs
  total_size = {'packed': 0, 'unpacked': 0}
  path_size = {'packed': collections.defaultdict(int),
               'unpacked': collections.defaultdict(int)}
  ext_size = {'packed': collections.defaultdict(int),
              'unpacked': collections.defaultdict(int)}
  dir_size = {'packed': collections.defaultdict(int),
              'unpacked': collections.defaultdict(int)}
  for sha in stats['names']:
    size = {'packed': stats['packed_size'][sha],
            'unpacked': stats['unpacked_size'][sha]}
    for which in ('packed', 'unpacked'):
      for name in stats['names'][sha]:
        total_size[which] += size[which]
        path_size[which][name] += size[which]
        basename, ext = os.path.splitext(name)
        ext_size[which][ext] += size[which]
        for dirname in dirnames(name):
          dir_size[which][dirname] += size[which]
  # Determine if and when extensions and directories were deleted
  ext_deleted_data = {}
  for name in stats['allnames']:
    when = stats['file_deletions'].get(name, None)
    # Update the extension
    basename, ext = os.path.splitext(name)
    if when is None:
      ext_deleted_data[ext] = None
    elif ext in ext_deleted_data:
      if ext_deleted_data[ext] is not None:
        ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
    else:
      ext_deleted_data[ext] = when
  dir_deleted_data = {}
  for name in dir_size['packed']:
    dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
  with open(os.path.join(reportdir, "README"), 'w') as f:
    # Give a basic overview of this file
    f.write("== Overal Statistics ==\n")
    f.write("  Number of commits:         {}\n".format(stats['num_commits']))
    f.write("  Number of filenames:       {}\n".format(len(path_size['packed'])))
    f.write("  Number of directories:     {}\n".format(len(dir_size['packed'])))
    f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
    f.write("\n")
    f.write("  Total unpacked size (bytes): {:10d}\n"
            .format(total_size['unpacked']))
    f.write("  Total packed size (bytes):   {:10d}\n"
            .format(total_size['packed']))
    f.write("\n")
    # Mention issues with the report
    f.write("== Caveats ==\n")
    f.write("=== Sizes ===\n")
    f.write(textwrap.dedent("""
      Packed size represents what size your repository would be if no
      trees, commits, tags, or other metadata were included (though it may
      fail to represent de-duplication; see below).  It also represents the
      current packing, which may be suboptimal if you haven't gc'ed for a
      while.
      Unpacked size represents what size your repository would be if no if
      no trees, commits, tags, or other metadata were included AND if no
      files were packed; i.e., without delta-ing or compression.
      Both unpacked and packed sizes can be slightly misleading.  Deleting
      a blob from history not save as much space as the unpacked size,
      because it is obviously normally stored in packed form.  Also,
      deleting a blob from history may not save as much space as its packed
      size either, because another blob could be stored as a delta against
      that blob, so when you remove one blob another blob's packed size may
      grow.
      Also, the sum of the packed sizes can add up to more than the
      repository size; if the same contents appeared in the repository in
      multiple places, git will automatically de-dupe and store only one
      copy, while the way sizes are added in this analysis adds the size
      for each file path that has those contents.  Further, if a file is
      ever reverted to a previous version's contents, the previous
      version's size will be counted multiple times in this analysis, even
      though git will only store it once.
      """[1:]))
    f.write("\n")
    f.write("=== Deletions ===\n")
    f.write(textwrap.dedent("""
      Whether a file is deleted is not a binary quality, since it can be
      deleted on some branches but still exist in others.  Also, it might
      exist in an old tag, but have been deleted in versions newer than
      that.  More thorough tracking could be done, including looking at
      merge commits where one side of history deleted and the other modified,
      in order to give a more holistic picture of deletions.  However, that
      algorithm would not only be more complex to implement, it'd also be
      quite difficult to present and interpret by users.  Since --analyze
      is just about getting a high-level rough picture of history, it instead
      implements the simplistic rule that is good enough for 98% of cases:
        A file is marked as deleted if the last commit in the fast-export
        stream that mentions the file lists it as deleted.
      This makes it dependent on topological ordering, but generally gives
      the "right" answer.
      """[1:]))
    f.write("\n")
    f.write("=== Renames ===\n")
    f.write(textwrap.dedent("""
      Renames share the same non-binary nature that deletions do, plus
      additional challenges:
        * If the renamed file is renamed again, instead of just two names for
          a path you can have three or more.
        * Rename pairs of the form (oldname, newname) that we consider to be
          different names of the "same file" might only be valid over certain
          commit ranges.  For example, if a new commit reintroduces a file
          named oldname, then new versions of oldname aren't the "same file"
          anymore.  We could try to portray this to the user, but it's easier
          for the user to just break the pairing and only report unbroken
          rename pairings to the user.
        * The ability for users to rename files differently in different
          branches means that our chains of renames will not necessarily be
          linear but may branch out.
      """[1:]))
    f.write("\n")
  # Equivalence classes for names, so if folks only want to keep a
  # certain set of paths, they know the old names they want to include
  # too.
  with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
    seen = set()
    for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
                                       key=lambda x:x[1]):
      if equiv_group in seen:
        continue
-      seen.add(equiv_group)
+      elif modes[-1] == '000000':
-      f.write("{} ->\n    ".format(equiv_group[0]) +
+        # Track when files/directories are deleted
-                   "\n    ".join(equiv_group[1:]) +
+        for f in RepoAnalyze.equiv_class(stats, filenames[-1]):
-              "\n")
+          if any(x == '040000' for x in modes[0:-1]):
            stats['tree_deletions'][f] = date
          else:
            stats['file_deletions'][f] = date
      elif change_types.strip('AMT') == '':
        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
      elif modes[-1] == '040000' and change_types.strip('RAM') == '':
        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
      elif change_types.strip('RAM') == '':
        RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames)
        RepoAnalyze.handle_renames(stats, commit, change_types, filenames)
      else:
        raise SystemExit("Unhandled change type(s): {} (in commit {})"
                         .format(change_types, commit))
-  # List directories in reverse sorted order of unpacked size
+  @staticmethod
-  with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
+  def gather_data(args):
-    f.write("=== Deleted directories by reverse size ===\n")
+    blob_size_progress = ProgressWriter()
-    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+    num_blobs = 0
-    for dirname, size in sorted(dir_size['packed'].iteritems(),
+
-                                key=lambda x:x[1], reverse=True):
+    # Get sizes of blobs by sha1
-      if (dir_deleted_data[dirname]):
+    cmd = '--batch-check=%(objectname) %(objecttype) ' + \
          '%(objectsize) %(objectsize:disk)'
    cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
                          bufsize = -1,
                          stdout = subprocess.PIPE)
    unpacked_size = {}
    packed_size = {}
    for line in cf.stdout:
      sha, objtype, objsize, objdisksize = line.split()
      objsize, objdisksize = int(objsize), int(objdisksize)
      if objtype == 'blob':
        unpacked_size[sha] = objsize
        packed_size[sha] = objdisksize
      num_blobs += 1
      blob_size_progress.show("Processed {} blob sizes".format(num_blobs))
    cf.wait()
    blob_size_progress.finish()
    stats = {'names': collections.defaultdict(set),
             'allnames' : set(),
             'file_deletions': {},
             'tree_deletions': {},
             'equivalence': {},
             'rename_history': collections.defaultdict(set),
             'unpacked_size': unpacked_size,
             'packed_size': packed_size,
             'num_commits': 0}
    # Setup the rev-list/diff-tree process
    commit_parse_progress = ProgressWriter()
    num_commits = 0
    cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
           ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
           ' --date=short -M -t -c --raw --combined-all-paths')
    dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
    f = dtp.stdout
    line = f.next()
    cont = bool(line)
    graph = AncestryGraph()
    while cont:
      commit = line.rstrip()
      parents = f.next().split()
      date = f.next().rstrip()
      # We expect a blank line next; if we get a non-blank line then
      # this commit modified no files and we need to move on to the next.
      # If there is no line, we've reached end-of-input.
      try:
        line = f.next().rstrip()
        cont = True
      except StopIteration:
        cont = False
      # If we haven't reached end of input, and we got a blank line meaning
      # a commit that has modified files, then get the file changes associated
      # with this commit.
      file_changes = []
      if cont and not line:
        cont = False
        for line in f:
          if not line.startswith(':'):
            cont = True
            break
          n = 1+max(1, len(parents))
          assert line.startswith(':'*(n-1))
          relevant = line[n-1:-1]
          splits = relevant.split(None, n)
          modes = splits[0:n]
          splits = splits[n].split(None, n)
          shas = splits[0:n]
          splits = splits[n].split('\t')
          change_types = splits[0]
          filenames = [PathQuoting.dequote(x) for x in splits[1:]]
          file_changes.append([modes, shas, change_types, filenames])
      # Analyze this commit and update progress
      RepoAnalyze.analyze_commit(stats, graph, commit, parents, date,
                                 file_changes)
      num_commits += 1
      commit_parse_progress.show("Processed {} commits".format(num_commits))
    # Show the final commits processed message and record the number of commits
    commit_parse_progress.finish()
    stats['num_commits'] = num_commits
    # Close the output, ensure rev-list|diff-tree pipeline completed successfully
    dtp.stdout.close()
    if dtp.wait():
      raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
    return stats
  @staticmethod
  def write_report(reportdir, stats):
    def datestr(datetimestr):
      """Return datetimestr, or the placeholder '<present>' when it is falsy."""
      if not datetimestr:
        return '<present>'
      return datetimestr
    def dirnames(path):
      """Yield each successive os.path.dirname() of path, ending with ''.

      The empty string is always the last value yielded.
      """
      parent = os.path.dirname(path)
      while parent != '':
        yield parent
        parent = os.path.dirname(parent)
      yield parent
    # Compute aggregate size information for paths, extensions, and dirs
    total_size = {'packed': 0, 'unpacked': 0}
    path_size = {'packed': collections.defaultdict(int),
                 'unpacked': collections.defaultdict(int)}
    ext_size = {'packed': collections.defaultdict(int),
                'unpacked': collections.defaultdict(int)}
    dir_size = {'packed': collections.defaultdict(int),
                'unpacked': collections.defaultdict(int)}
    for sha in stats['names']:
      size = {'packed': stats['packed_size'][sha],
              'unpacked': stats['unpacked_size'][sha]}
      for which in ('packed', 'unpacked'):
        for name in stats['names'][sha]:
          total_size[which] += size[which]
          path_size[which][name] += size[which]
          basename, ext = os.path.splitext(name)
          ext_size[which][ext] += size[which]
          for dirname in dirnames(name):
            dir_size[which][dirname] += size[which]
    # Determine if and when extensions and directories were deleted
    ext_deleted_data = {}
    for name in stats['allnames']:
      when = stats['file_deletions'].get(name, None)
      # Update the extension
      basename, ext = os.path.splitext(name)
      if when is None:
        ext_deleted_data[ext] = None
      elif ext in ext_deleted_data:
        if ext_deleted_data[ext] is not None:
          ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
      else:
        ext_deleted_data[ext] = when
    dir_deleted_data = {}
    for name in dir_size['packed']:
      dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
    with open(os.path.join(reportdir, "README"), 'w') as f:
      # Give a basic overview of this file
      f.write("== Overal Statistics ==\n")
      f.write("  Number of commits:         {}\n".format(stats['num_commits']))
      f.write("  Number of filenames:       {}\n".format(len(path_size['packed'])))
      f.write("  Number of directories:     {}\n".format(len(dir_size['packed'])))
      f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
      f.write("\n")
      f.write("  Total unpacked size (bytes): {:10d}\n"
              .format(total_size['unpacked']))
      f.write("  Total packed size (bytes):   {:10d}\n"
              .format(total_size['packed']))
      f.write("\n")
      # Mention issues with the report
      f.write("== Caveats ==\n")
      f.write("=== Sizes ===\n")
      f.write(textwrap.dedent("""
        Packed size represents what size your repository would be if no
        trees, commits, tags, or other metadata were included (though it may
        fail to represent de-duplication; see below).  It also represents the
        current packing, which may be suboptimal if you haven't gc'ed for a
        while.
        Unpacked size represents what size your repository would be if no if
        no trees, commits, tags, or other metadata were included AND if no
        files were packed; i.e., without delta-ing or compression.
        Both unpacked and packed sizes can be slightly misleading.  Deleting
        a blob from history not save as much space as the unpacked size,
        because it is obviously normally stored in packed form.  Also,
        deleting a blob from history may not save as much space as its packed
        size either, because another blob could be stored as a delta against
        that blob, so when you remove one blob another blob's packed size may
        grow.
        Also, the sum of the packed sizes can add up to more than the
        repository size; if the same contents appeared in the repository in
        multiple places, git will automatically de-dupe and store only one
        copy, while the way sizes are added in this analysis adds the size
        for each file path that has those contents.  Further, if a file is
        ever reverted to a previous version's contents, the previous
        version's size will be counted multiple times in this analysis, even
        though git will only store it once.
        """[1:]))
      f.write("\n")
      f.write("=== Deletions ===\n")
      f.write(textwrap.dedent("""
        Whether a file is deleted is not a binary quality, since it can be
        deleted on some branches but still exist in others.  Also, it might
        exist in an old tag, but have been deleted in versions newer than
        that.  More thorough tracking could be done, including looking at
        merge commits where one side of history deleted and the other modified,
        in order to give a more holistic picture of deletions.  However, that
        algorithm would not only be more complex to implement, it'd also be
        quite difficult to present and interpret by users.  Since --analyze
        is just about getting a high-level rough picture of history, it instead
        implements the simplistic rule that is good enough for 98% of cases:
          A file is marked as deleted if the last commit in the fast-export
          stream that mentions the file lists it as deleted.
        This makes it dependent on topological ordering, but generally gives
        the "right" answer.
        """[1:]))
      f.write("\n")
      f.write("=== Renames ===\n")
      f.write(textwrap.dedent("""
        Renames share the same non-binary nature that deletions do, plus
        additional challenges:
          * If the renamed file is renamed again, instead of just two names for
            a path you can have three or more.
          * Rename pairs of the form (oldname, newname) that we consider to be
            different names of the "same file" might only be valid over certain
            commit ranges.  For example, if a new commit reintroduces a file
            named oldname, then new versions of oldname aren't the "same file"
            anymore.  We could try to portray this to the user, but it's easier
            for the user to just break the pairing and only report unbroken
            rename pairings to the user.
          * The ability for users to rename files differently in different
            branches means that our chains of renames will not necessarily be
            linear but may branch out.
        """[1:]))
      f.write("\n")
    # Equivalence classes for names, so if folks only want to keep a
    # certain set of paths, they know the old names they want to include
    # too.
    with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
      seen = set()
      for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
                                         key=lambda x:x[1]):
        if equiv_group in seen:
          continue
        seen.add(equiv_group)
        f.write("{} ->\n    ".format(equiv_group[0]) +
                     "\n    ".join(equiv_group[1:]) +
                "\n")
    # List directories in reverse sorted order of unpacked size
    with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f:
      f.write("=== Deleted directories by reverse size ===\n")
      f.write("Format: unpacked size, packed size, date deleted, directory name\n")
      for dirname, size in sorted(dir_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
        if (dir_deleted_data[dirname]):
          f.write("  {:10d} {:10d} {:10s} {}\n"
                  .format(dir_size['unpacked'][dirname],
                          size,
                          datestr(dir_deleted_data[dirname]),
                          dirname or '<toplevel>'))
    with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
      f.write("=== All directories by reverse size ===\n")
      f.write("Format: unpacked size, packed size, date deleted, directory name\n")
      for dirname, size in sorted(dir_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
        f.write("  {:10d} {:10d} {:10s} {}\n"
                .format(dir_size['unpacked'][dirname],
                        size,
                        datestr(dir_deleted_data[dirname]),
                        dirname or '<toplevel>'))
-  with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f:
+    # List extensions in reverse sorted order of unpacked size
-    f.write("=== All directories by reverse size ===\n")
+    with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
-    f.write("Format: unpacked size, packed size, date deleted, directory name\n")
+      f.write("=== Deleted extensions by reverse size ===\n")
-    for dirname, size in sorted(dir_size['packed'].iteritems(),
+      f.write("Format: unpacked size, packed size, date deleted, extension name\n")
-                                key=lambda x:x[1], reverse=True):
+      for extname, size in sorted(ext_size['packed'].iteritems(),
-      f.write("  {:10d} {:10d} {:10s} {}\n"
+                                  key=lambda x:x[1], reverse=True):
-              .format(dir_size['unpacked'][dirname],
+        if (ext_deleted_data[extname]):
-                      size,
+          f.write("  {:10d} {:10d} {:10s} {}\n"
-                      datestr(dir_deleted_data[dirname]),
+                  .format(ext_size['unpacked'][extname],
-                      dirname or '<toplevel>'))
+                          size,
                          datestr(ext_deleted_data[extname]),
                          extname or '<no extension>'))
-  # List extensions in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
-  with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f:
+      f.write("=== All extensions by reverse size ===\n")
-    f.write("=== Deleted extensions by reverse size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, extension name\n")
-    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+      for extname, size in sorted(ext_size['packed'].iteritems(),
-    for extname, size in sorted(ext_size['packed'].iteritems(),
+                                  key=lambda x:x[1], reverse=True):
                                key=lambda x:x[1], reverse=True):
      if (ext_deleted_data[extname]):
        f.write("  {:10d} {:10d} {:10s} {}\n"
                .format(ext_size['unpacked'][extname],
                        size,
                        datestr(ext_deleted_data[extname]),
                        extname or '<no extension>'))
-  with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f:
+    # List files in reverse sorted order of unpacked size
-    f.write("=== All extensions by reverse size ===\n")
+    with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
-    f.write("Format: unpacked size, packed size, date deleted, extension name\n")
+      f.write("=== Deleted paths by reverse accumulated size ===\n")
-    for extname, size in sorted(ext_size['packed'].iteritems(),
+      f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
-                                key=lambda x:x[1], reverse=True):
+      for pathname, size in sorted(path_size['packed'].iteritems(),
-      f.write("  {:10d} {:10d} {:10s} {}\n"
+                                   key=lambda x:x[1], reverse=True):
-              .format(ext_size['unpacked'][extname],
+        when = stats['file_deletions'].get(pathname, None)
-                      size,
+        if when:
-                      datestr(ext_deleted_data[extname]),
+          f.write("  {:10d} {:10d} {:10s} {}\n"
-                      extname or '<no extension>'))
+                  .format(path_size['unpacked'][pathname],
                          size,
                          datestr(when),
                          pathname))
-  # List files in reverse sorted order of unpacked size
+    with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
-  with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f:
+      f.write("=== All paths by reverse accumulated size ===\n")
-    f.write("=== Deleted paths by reverse accumulated size ===\n")
+      f.write("Format: unpacked size, packed size, date deleted, pathectory name\n")
-    f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
+      for pathname, size in sorted(path_size['packed'].iteritems(),
-    for pathname, size in sorted(path_size['packed'].iteritems(),
+                                   key=lambda x:x[1], reverse=True):
-                                 key=lambda x:x[1], reverse=True):
+        when = stats['file_deletions'].get(pathname, None)
      when = stats['file_deletions'].get(pathname, None)
      if when:
        f.write("  {:10d} {:10d} {:10s} {}\n"
                .format(path_size['unpacked'][pathname],
                        size,
                        datestr(when),
                        pathname))
-  with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f:
+    # List of filenames and sizes in descending order
-    f.write("=== All paths by reverse accumulated size ===\n")
+    with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
-    f.write("Format: unpacked size, packed size, date deleted, pathectory name\n")
+      f.write("== Files by sha and associated pathnames in reverse size ==\n")
-    for pathname, size in sorted(path_size['packed'].iteritems(),
+      f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
-                                 key=lambda x:x[1], reverse=True):
+      for sha, size in sorted(stats['packed_size'].iteritems(),
-      when = stats['file_deletions'].get(pathname, None)
+                              key=lambda x:x[1], reverse=True):
-      f.write("  {:10d} {:10d} {:10s} {}\n"
+        if sha not in stats['names']:
-              .format(path_size['unpacked'][pathname],
+          # Some objects in the repository might not be referenced, or not
-                      size,
+          # referenced by the branches/tags the user cares about; skip them.
-                      datestr(when),
+          continue
-                      pathname))
+        names_with_sha = stats['names'][sha]
        if len(names_with_sha) == 1:
          names_with_sha = names_with_sha.pop()
        else:
          names_with_sha = sorted(list(names_with_sha))
        f.write("  {} {:10d} {:10d} {}\n".format(sha,
                                                 stats['unpacked_size'][sha],
                                                 size,
                                                 names_with_sha))
-  # List of filenames and sizes in descending order
+  @staticmethod
-  with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
+  def run(args, git_dir):
-    f.write("== Files by sha and associated pathnames in reverse size ==\n")
+    # Create the report directory as necessary
-    f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
+    results_tmp_dir = os.path.join(git_dir, 'filter-repo')
-    for sha, size in sorted(stats['packed_size'].iteritems(),
+    if not os.path.isdir(results_tmp_dir):
-                            key=lambda x:x[1], reverse=True):
+      os.mkdir(results_tmp_dir)
-      if sha not in stats['names']:
+    reportdir = os.path.join(results_tmp_dir, "analysis")
-        # Some objects in the repository might not be referenced, or not
+    if not args.force and os.path.isdir(reportdir):
-        # referenced by the branches/tags the user cares about; skip them.
+      raise SystemExit("Error: {} already exists; refusing to overwrite!".
-        continue
+                       format(reportdir))
-      names_with_sha = stats['names'][sha]
+    os.mkdir(reportdir)
      if len(names_with_sha) == 1:
        names_with_sha = names_with_sha.pop()
      else:
        names_with_sha = sorted(list(names_with_sha))
      f.write("  {} {:10d} {:10d} {}\n".format(sha,
                                               stats['unpacked_size'][sha],
                                               size,
                                               names_with_sha))
-def do_analysis(args, git_dir):
+    # Gather the data we need
-  # Create the report directory as necessary
+    stats = RepoAnalyze.gather_data(args)
  results_tmp_dir = os.path.join(git_dir, 'filter-repo')
  if not os.path.isdir(results_tmp_dir):
    os.mkdir(results_tmp_dir)
  reportdir = os.path.join(results_tmp_dir, "analysis")
  if not args.force and os.path.isdir(reportdir):
    raise SystemExit("Error: {} already exists; refusing to overwrite!".
                     format(reportdir))
  os.mkdir(reportdir)
-  # Gather the data we need
+    # Write the reports
-  stats = gather_data(args)
+    sys.stdout.write("Writing reports to {}...".format(reportdir))
-
+    sys.stdout.flush()
-  # Write the reports
+    RepoAnalyze.write_report(reportdir, stats)
-  sys.stdout.write("Writing reports to {}...".format(reportdir))
+    sys.stdout.write("done.\n")
  sys.stdout.flush()
  write_report(reportdir, stats)
  sys.stdout.write("done.\n")
 def sanity_check(refs, is_bare):
  def abort(reason):
@ -2506,7 +2524,7 @@ def run_fast_filter():
  # Do analysis, if requested
  if args.analyze:
-    do_analysis(args, git_dir)
+    RepoAnalyze.run(args, git_dir)
    return
  # Do sanity checks