From 6ca3d7c1c77d10ab3922975391452b3eb9998c97 Mon Sep 17 00:00:00 2001
From: Elijah Newren <newren@gmail.com>
Date: Sun, 21 Oct 2018 07:44:33 -0700
Subject: [PATCH] filter-repo: add --analyze option

This option walks through the repository history and creates a report
with basic statistics, rename-related information, and sizes of objects,
including when/if those objects were deleted.  It primarily looks at
unpacked sizes (i.e. the size of an object ignoring delta-ing and
compression), and sums the size of each version of the file for each
path.  Additionally, it aggregates these sums by extension and by
directory, and tracks whether paths, extensions, and directories have
been deleted.  This can be very useful in determining what the big
things are, and whether they might be considered mistakes to have added
to the repository in the first place.

There are numerous caveats with the determination of "deleted" and
"renamed", which can give both false positives and false negatives.
But these determinations are only meant as a helpful heuristic to give
others a starting point for an investigation, and the information
provided so far is useful.  I do want to improve the equivalence
classes (rename handling), but that is for a future commit.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 git-filter-repo | 365 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 365 insertions(+)

diff --git a/git-filter-repo b/git-filter-repo
index ec44f7a..d50d027 100755
--- a/git-filter-repo
+++ b/git-filter-repo
@@ -356,6 +356,12 @@ class FileChanges(_GitElement):
       self.mode = mode
       self.blob_id = id_

+    # For 'R' file changes (rename), expect to have newname as third arg
+    elif type_ == 'R':
+      if id_ is None:
+        raise SystemExit("new name needed for rename of %s" % filename)
+      self.filename = (self.filename, id_)
+
   def dump(self, file_):
     """
     Write this file-change element to a file
@@ -775,6 +781,20 @@ class FastExportFilter(object):
         path = unquote(path)
       filechange = FileChanges('D', path)
       self._advance_currentline()
+    elif self._currentline.startswith('R '):
+      rest = self._currentline[2:-1]
+      if rest.startswith('"'):
+        m = re.match(r'"(?:[^"\\]|\\.)*"', rest)
+        if not m:
+          raise SystemExit("Couldn't parse rename source")
+        orig = unquote(m.group(0))
+        new = rest[m.end()+1:]
+      else:
+        orig, new = rest.split(' ', 1)
+      if new.startswith('"'):
+        new = unquote(new)
+      filechange = FileChanges('R', orig, new)
+      self._advance_currentline()
     return filechange

   def _parse_original_id(self):
@@ -913,6 +933,9 @@ class FastExportFilter(object):
     else:
       return new_hash[0:orig_len]

+  def num_commits_parsed(self):
+    return self._num_commits
+
   def _show_progress(self, force=False):
     if not self._quiet:
       now = time.time()
@@ -1482,6 +1505,10 @@ def get_args():
   parser = argparse.ArgumentParser(description='Rewrite repository history')
   # FIXME: Need to special case all --* args that rev-list takes, or call
   # git rev-parse ...
+  parser.add_argument('--analyze', action='store_true',
+      help='''Analyze repository history and create a report that may be
+           useful in determining what to filter in a subsequent
+           run.''')
   parser.add_argument('--force', '-f', action='store_true',
       help='''Rewrite history even if the current repo does not look
            like a fresh clone.''')
@@ -1552,6 +1579,11 @@ def get_args():
   args = parser.parse_args()
   if not args.revisions:
     args.revisions = ['--all']
+  if args.analyze and args.path_changes:
+    raise SystemExit("Error: --analyze is incompatible with --path* flags; "
+                     "it's a read-only operation.")
+  if args.analyze and args.stdin:
+    raise SystemExit("Error: --analyze is incompatible with --stdin.")
   # If no path_changes are found, initialize with empty list but mark as
   # not inclusive so that all files match
   if args.path_changes == None:
@@ -1647,6 +1679,334 @@ def get_refs():
     output = ''
   return dict(reversed(x.split()) for x in output.splitlines())

+def analyze_commit(args, commit):
+  def equiv_class(filename):
+    return args.stats['equivalence'].get(filename, (filename,))
+
+  for change in commit.file_changes:
+    if change.mode == '160000':
+      continue
+    if change.type == 'D':
+      # Track when files are deleted; see 'R' below about equiv_class
+      for f in equiv_class(change.filename):
+        args.stats['deletions'][f] = commit.committer_date
+    elif change.type == 'R':
+      # Since we want to know when files are deleted, renames make it
+      # slightly harder to track.  When we have a rename, track that the
+      # files are equivalent; i.e. that they refer to different versions
+      # of the same file.
+      oldname, newname = change.filename
+      old_tuple = args.stats['equivalence'].get(oldname, ())
+      if newname in old_tuple:
+        continue
+      if old_tuple:
+        new_tuple = tuple(list(old_tuple)+[newname])
+      else:
+        new_tuple = (oldname, newname)
+      for f in new_tuple:
+        args.stats['equivalence'][f] = new_tuple
+      # Note, we require that we get an 'M' for every 'R' since the rename
+      # comes without information about the sha1sum, so we can handle
+      # setting a few things for newname in the 'M' section below.
+    elif change.type == 'M':
+      args.stats['names'][change.blob_id].add(change.filename)
+      args.stats['allnames'].add(change.filename)
+      # If we get an 'M', clearly the file isn't deleted anymore
+      equiv = equiv_class(change.filename)
+      for f in equiv:
+        args.stats['deletions'].pop(f, None)
+      # If we get an 'M' for a file that wasn't the latest in a rename
+      # chain, then that equivalence class isn't valid anymore.
+      if equiv[-1] != change.filename:
+        for f in equiv:
+          if f in args.stats['equivalence']:
+            del args.stats['equivalence'][f]
+    else:
+      raise SystemExit("Unhandled change type: {}".format(change.type))
+
+  # We're just gathering data; don't spend time dumping the commit
+  commit.dumped = 2
+
+def gather_data(args):
+  # Get sizes of blobs by sha1
+  cf = subprocess.Popen('git cat-file --batch-check --batch-all-objects'.split(),
+                        stdout = subprocess.PIPE)
+  size = {}
+  for line in cf.stdout:
+    sha, objtype, shasize = line.split()
+    shasize = int(shasize)
+    if objtype == 'blob':
+      size[sha] = shasize
+  stats = {'names': collections.defaultdict(set),
+           'allnames' : set(),
+           'deletions': {},
+           'equivalence': {},
+           'size': size}
+
+  # Set up the fast-export process
+  fep_cmd = ['git', 'fast-export',
+             '-M',
+             '--no-data',
+             '--show-original-ids',
+             '--always-show-modify-after-rename',
+             '--signed-tags=strip',
+             '--tag-of-filtered-object=rewrite',
+             '--use-done-feature'] + args.revisions
+  fep = subprocess.Popen(fep_cmd, stdout=subprocess.PIPE)
+  input = fep.stdout
+  output = open(os.devnull, 'w')
+
+  # Create and run the filter
+  setattr(args, 'size', size)
+  setattr(args, 'stats', stats)
+  analyze_filter = FastExportFilter(
+      commit_callback = lambda c : analyze_commit(args, c),
+      )
+  analyze_filter.run(input, output, quiet = args.quiet)
+  setattr(args, 'num_commits', analyze_filter.num_commits_parsed())
+
+  # Close the output, ensure fast-export has completed
+  output.close()
+  if fep.wait():
+    raise SystemExit("Error: fast-export failed; see above.")
+  cf.wait()
+
+def do_analysis(args, git_dir):
+  # Create the report file as necessary
+  results_tmp_dir = os.path.join(git_dir, 'filter-repo')
+  if not os.path.isdir(results_tmp_dir):
+    os.mkdir(results_tmp_dir)
+  reportfile = os.path.join(results_tmp_dir,
+                            "repo-analysis-{}.txt".format(time.strftime("%F")))
+  if not args.force and os.path.isfile(reportfile):
+    raise SystemExit("Error: {} already exists; refusing to overwrite!".
+                     format(reportfile))
+
+  # Now gather the data we need
+  gather_data(args)
+
+  def datestr(datetimeobj):
+    return datetimeobj.strftime('%F') if datetimeobj else ''
+
+  def dirnames(path):
+    while True:
+      path = os.path.dirname(path)
+      yield path
+      if path == '':
+        break
+
+  # Compute aggregate unpacked size information for paths, extensions,
+  # and dirs
+  total_size = 0
+  path_size = collections.defaultdict(int)
+  ext_size = collections.defaultdict(int)
+  dir_size = collections.defaultdict(int)
+  for sha in args.stats['names']:
+    size = args.size[sha]
+    for name in args.stats['names'][sha]:
+      total_size += size
+      path_size[name] += size
+      basename, ext = os.path.splitext(name)
+      ext_size[ext] += size
+      for dirname in dirnames(name):
+        dir_size[dirname] += size
+
+  # Determine if and when extensions and directories were deleted
+  ext_deleted_data = {}
+  dir_deleted_data = {}
+  for name in args.stats['allnames']:
+    when = args.stats['deletions'].get(name, None)
+
+    # Update the extension
+    basename, ext = os.path.splitext(name)
+    if when is None:
+      ext_deleted_data[ext] = None
+    elif ext in ext_deleted_data:
+      if ext_deleted_data[ext] is not None:
+        ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
+    else:
+      ext_deleted_data[ext] = when
+
+    # Update the dirs
+    for dirname in dirnames(name):
+      if when is None:
+        dir_deleted_data[dirname] = None
+      elif dirname in dir_deleted_data:
+        if dir_deleted_data[dirname] is not None:
+          dir_deleted_data[dirname] = max(dir_deleted_data[dirname], when)
+      else:
+        dir_deleted_data[dirname] = when
+
+  with open(reportfile, 'w') as f:
+    # Give a basic overview of this file
+    f.write("== Table of Contents ==\n")
+    f.write(" * Overall Statistics\n")
+    f.write(" * Caveats\n")
+    f.write(" * File renames\n")
+    f.write(" * Directory sizes\n")
+    f.write(" * Deleted directories\n")
+    f.write(" * All directories\n")
+    f.write(" * Filename extension sizes\n")
+    f.write(" * Deleted extensions\n")
+    f.write(" * All extensions\n")
+    f.write(" * Path sizes (accumulated across commits)\n")
+    f.write(" * Deleted paths\n")
+    f.write(" * All paths\n")
+    f.write(" * Files by sha and associated pathnames\n")
+    f.write("\n")
+
+    # Provide total unpacked size
+    f.write("== Overall Statistics ==\n")
+    f.write(" Number of commits: {}\n".format(args.num_commits))
+    f.write(" Number of filenames: {}\n".format(len(path_size)))
+    f.write(" Number of directories: {}\n".format(len(dir_size)))
+    f.write(" Number of file extensions: {}\n".format(len(ext_size)))
+    f.write("\n")
+    f.write(" Total unpacked size: {}\n".format(total_size))
+    f.write("\n")
+    f.write(" (Unpacked size represents what size your repository would be\n")
+    f.write(" if no trees, commits, tags, or other metadata were included\n")
+    f.write(" AND if no files were packed; i.e., without delta-ing and\n")
+    f.write(" without compression.)\n")
+    f.write("\n")
+
+    # Mention issues with the report
+    f.write("== Caveats ==\n")
+    f.write("=== Deletions ===\n")
+    f.write(textwrap.dedent("""
+      Whether a file is deleted is not a binary quality, since it can be
+      deleted on some branches but still exist in others.  Also, it might
+      exist in an old tag, but have been deleted in versions newer than
+      that.  More thorough tracking could be done, including looking at
+      merge commits where one side of history deleted and the other
+      modified, in order to give a more holistic picture of deletions.
+      However, that algorithm would not only be more complex to implement,
+      it'd also be quite difficult to present and for users to interpret.
+      Since --analyze is just about getting a high-level rough picture of
+      history, it instead implements the simplistic rule that is good
+      enough for 98% of cases:
+        A file is marked as deleted if the last commit in the fast-export
+        stream that mentions the file lists it as deleted.
+      This makes it dependent on topological ordering, but generally gives
+      the "right" answer.
+      """[1:]))
+    f.write("=== Renames ===\n")
+    f.write(textwrap.dedent("""
+      Renames share the same non-binary nature as deletions, plus
+      additional challenges:
+        * If the renamed file is renamed again, instead of just two names
+          for a path you can have three or more.
+        * Rename pairs of the form (oldname, newname) that we consider to
+          be different names of the "same file" might only be valid over
+          certain commit ranges.  For example, if a new commit reintroduces
+          a file named oldname, then new versions of oldname aren't the
+          "same file" anymore.  We could try to portray this to the user,
+          but it's easier to just break the pairing and only report
+          unbroken rename pairings to the user.
+        * Since modifying a renamed file on the side of history that
+          doesn't rename it should be expected to be common (unlike
+          modifying a deleted file on the side of history that doesn't
+          delete it), tracking history becomes more important to avoid
+          incorrectly breaking rename chains.  This has not yet been
+          implemented, which seriously raises the risk of erroneously
+          breaking rename pairings; a future release may address this
+          shortcoming.
+        * We only use rename detection, not copy detection.  That means
+          that if some commit in history renamed two files into the same
+          location, we won't pick up one of the two renames and will
+          instead report that source path as having been deleted.
+        * The ability for users to rename files differently in different
+          branches means that our chains of renames will not necessarily
+          be linear but may branch out.
+      """[1:]))
+    f.write("\n")
+
+    # Equivalence classes for names, so if folks only want to keep a
+    # certain set of paths, they know the old names they want to include
+    # too.
+ f.write("== File renames ==\n") + seen = set() + for pathname,equiv_group in sorted(args.stats['equivalence'].iteritems(), + key=lambda x:x[1]): + if equiv_group in seen: + continue + seen.add(equiv_group) + f.write(" {} ->\n ".format(equiv_group[0]) + + "\n ".join(equiv_group[1:]) + + "\n") + f.write("\n") + + # List directories in reverse sorted order of unpacked size + f.write("== Directory sizes ==\n") + f.write("=== Deleted directories by reverse size ===\n") + f.write("Format: size (bytes), date deleted, directory name\n") + for dirname, size in sorted(dir_size.iteritems(), + key=lambda x:x[1], reverse=True): + if (dir_deleted_data[dirname]): + f.write(" {:10d} {:10s} {}\n".format(size, + datestr(dir_deleted_data[dirname]), + dirname or '')) + f.write("\n") + f.write("=== All directories by reverse size ===\n") + f.write("Format: size (bytes), date deleted, directory name\n") + for dirname, size in sorted(dir_size.iteritems(), + key=lambda x:x[1], reverse=True): + f.write(" {:10d} {:10s} {}\n".format(size, datestr(dir_deleted_data[dirname]), + dirname or '')) + f.write("\n") + + # List extensions in reverse sorted order of unpacked size + f.write("== Filename extension sizes ==\n") + f.write("=== Deleted extensions by reverse size ===\n") + f.write("Format: size (bytes), date deleted, extension name\n") + for extname, size in sorted(ext_size.iteritems(), + key=lambda x:x[1], reverse=True): + if (ext_deleted_data[extname]): + f.write(" {:10d} {:10s} {}\n".format(size, + datestr(ext_deleted_data[extname]), + extname or '')) + f.write("\n") + f.write("=== All extensions by reverse size ===\n") + f.write("Format: size (bytes), date deleted, extension name\n") + for extname, size in sorted(ext_size.iteritems(), + key=lambda x:x[1], reverse=True): + f.write(" {:10d} {:10s} {}\n".format(size, + datestr(ext_deleted_data[extname]), + extname or '')) + f.write("\n") + + # List files in reverse sorted order of unpacked size + f.write("== Path sizes (accumulated across commits) ==\n") + f.write("=== Deleted paths by reverse size ===\n") + f.write("Format: size (bytes), date deleted, path name(s)\n") + for pathname, size in sorted(path_size.iteritems(), + key=lambda x:x[1], reverse=True): + when = args.stats['deletions'].get(pathname, None) + if when: + f.write(" {:10d} {:10s} {}\n".format(size, datestr(when), pathname)) + f.write("\n") + f.write("=== All paths by reverse size ===\n") + f.write("Format: size (bytes), date deleted, pathectory name\n") + for pathname, size in sorted(path_size.iteritems(), + key=lambda x:x[1], reverse=True): + when = args.stats['deletions'].get(pathname, None) + f.write(" {:10d} {:10s} {}\n".format(size, datestr(when), pathname)) + f.write("\n") + + # List of filenames and sizes in descending order + f.write("== Files by sha and associated pathnames in reverse size ==\n") + f.write("Format: sha, size (bytes), filename(s) object stored as\n") + for sha, size in sorted(args.size.iteritems(), key=lambda x:x[1], + reverse=True): + if sha not in args.stats['names']: + # Some objects in the repository might not be referenced, or not + # referenced by the branches/tags the user cares about; skip them. 
+        continue
+      names_with_sha = args.stats['names'][sha]
+      if len(names_with_sha) == 1:
+        names_with_sha = names_with_sha.pop()
+      else:
+        names_with_sha = sorted(list(names_with_sha))
+      f.write(" {} {:9d} {}\n".format(sha, size, names_with_sha))
+    f.write("\n")
+
+  print("Report written to {}".format(reportfile))
+
 def tweak_commit(args, commit):
   def filename_matches(path_expression, pathname):
     if path_expression == '':
@@ -1737,6 +2097,11 @@ def run_fast_filter():
   is_bare = is_repository_bare()
   git_dir = determine_git_dir()

+  # Do analysis, if requested
+  if args.analyze:
+    do_analysis(args, git_dir)
+    return
+
   # Do sanity checks
   if not args.force:
     sanity_check(orig_refs, is_bare)
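
The sketches below are editor illustrations of the techniques this patch
uses; they are not part of the patch itself.

First, the rename-tracking heuristic: analyze_commit keeps "equivalence
classes" of names, growing a class on each 'R' change and invalidating it
when an 'M' touches a name that is not the latest in the chain.  Here is a
minimal standalone sketch of that bookkeeping, using a simplified stream of
(change_type, payload) tuples in place of real fast-export data; the
function name track_renames and the input format are hypothetical:

    def track_renames(changes):
      # equivalence: path -> tuple of names considered "the same file"
      # deletions:   path -> position of the change that deleted it
      #              (the real code stores commit.committer_date instead)
      equivalence = {}
      deletions = {}

      def equiv_class(filename):
        return equivalence.get(filename, (filename,))

      for i, (ctype, payload) in enumerate(changes):
        if ctype == 'D':
          # Deleting any name in a class marks every name in it deleted
          for f in equiv_class(payload):
            deletions[f] = i
        elif ctype == 'R':
          oldname, newname = payload
          old_tuple = equivalence.get(oldname, ())
          if newname in old_tuple:
            continue
          new_tuple = old_tuple + (newname,) if old_tuple \
                      else (oldname, newname)
          for f in new_tuple:
            equivalence[f] = new_tuple
        elif ctype == 'M':
          equiv = equiv_class(payload)
          for f in equiv:
            deletions.pop(f, None)   # a modify means "not deleted anymore"
          if equiv[-1] != payload:   # modified an old name: class is stale
            for f in equiv:
              equivalence.pop(f, None)
      return equivalence, deletions

    # 'a' renamed to 'b', then 'b' deleted: both names count as deleted.
    equivalence, deletions = track_renames(
        [('M', 'a'), ('R', ('a', 'b')), ('M', 'b'), ('D', 'b')])
    assert equivalence['a'] == ('a', 'b')
    assert sorted(deletions) == ['a', 'b']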
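
Second, the blob-size table.  gather_data parses the output of
`git cat-file --batch-check --batch-all-objects`, whose default format is
one "<sha> <type> <size>" line per object.  A standalone sketch of just
that step, runnable inside any git repository (the decode call is added so
the sketch also works under Python 3; the patch itself targets Python 2,
where the pipe yields str):

    import subprocess

    cmd = ['git', 'cat-file', '--batch-check', '--batch-all-objects']
    cf = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    size = {}
    for line in cf.stdout:
      sha, objtype, shasize = line.decode('utf-8').split()
      if objtype == 'blob':
        size[sha] = int(shasize)    # unpacked size, as --analyze reports
    cf.wait()
    print('{} blobs, {} total unpacked bytes'.format(
        len(size), sum(size.values())))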
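
Finally, the per-directory aggregation: dirnames() walks each path up
through its ancestors, ending at '' (the toplevel), so a blob's size is
charged to every enclosing directory.  A tiny worked example with
hypothetical paths:

    import collections
    import os.path

    def dirnames(path):
      # Yield each ancestor directory of path, ending with '' (toplevel)
      while True:
        path = os.path.dirname(path)
        yield path
        if path == '':
          break

    dir_size = collections.defaultdict(int)
    for name, size in [('src/main.c', 400), ('src/util/hash.c', 100)]:
      for dirname in dirnames(name):
        dir_size[dirname] += size

    assert dir_size['src'] == 500       # accumulates both files
    assert dir_size['src/util'] == 100
    assert dir_size[''] == 500          # toplevel sees everything

Because each blob is charged to every ancestor, directory sizes are not
disjoint; the toplevel entry always equals the total unpacked size.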