Mirror of https://github.com/newren/git-filter-repo.git, synced 2024-07-06 18:32:14 +02:00
filter-repo: switch --analyze to use rev-list|diff-tree pipeline
As suggested by Peff, use rev-list & diff-tree to get the information we need, instead of relying on fast-export (with some out-of-tree patches) to get that information.

Signed-off-by: Elijah Newren <newren@gmail.com>
parent beff0b958f
commit 554c7e39af
git-filter-repo
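For context before the diff: with --format="%H%n%P%n%cd", diff-tree prints each commit's hash, parent list, and committer date on three lines, followed by that commit's ':'-prefixed raw change records. A minimal, hypothetical Python sketch of driving such a pipeline (illustration only, not the code in this commit; the --all ref selection and the printing are made up):

import subprocess

# Stream commits oldest-first together with their raw file changes.
# --format makes diff-tree print hash, parents, and date on three lines
# before each commit's ':'-prefixed raw change records.
cmd = ('git rev-list --topo-order --reverse --all'
       ' | git diff-tree --stdin --always --root'
       ' --format="%H%n%P%n%cd" --date=short -M -t -c --raw'
       ' --combined-all-paths')
proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                        universal_newlines=True)
for line in proc.stdout:
    if line.startswith(':'):
        print('change: ' + line.rstrip())  # raw file-change record
    elif line.rstrip():
        print('header: ' + line.rstrip())  # hash, parent list, or date
proc.stdout.close()
if proc.wait():
    raise SystemExit('rev-list|diff-tree pipeline failed')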
@@ -1824,52 +1824,93 @@ def get_refs():
     output = ''
   return dict(reversed(x.split()) for x in output.splitlines())
 
-def analyze_commit(args, commit):
+def analyze_commit(stats, graph, commit, parents, date, file_changes):
   def equiv_class(filename):
-    return args.stats['equivalence'].get(filename, (filename,))
+    return stats['equivalence'].get(filename, (filename,))
 
-  for change in commit.file_changes:
-    if change.mode == '160000':
-      continue
-    if change.type == 'D':
-      # Track when files are deleted; see 'R' below about equiv_class
-      for f in equiv_class(change.filename):
-        args.stats['deletions'][f] = commit.committer_date
-    elif change.type == 'R':
-      # Since we want to know when files are deleted, renames make it slightly
-      # harder to track.  When we have a rename, track that the files are
-      # equivalent; i.e. that they refer to different versions of same file.
-      oldname, newname = change.filename
-      old_tuple = args.stats['equivalence'].get(oldname, ())
-      if newname in old_tuple:
-        continue
-      if old_tuple:
-        new_tuple = tuple(list(old_tuple)+[newname])
-      else:
-        new_tuple = (oldname, newname)
-      for f in new_tuple:
-        args.stats['equivalence'][f] = new_tuple
-      # Note, we require that we get an 'M' for every 'R' since the rename
-      # comes without information about sha1sum.  So we can handle setting
-      # a few things for newname in the 'M' section below.
-    elif change.type == 'M':
-      args.stats['names'][change.blob_id].add(change.filename)
-      args.stats['allnames'].add(change.filename)
-      # If we get an 'M', clearly the file isn't deleted anymore
-      equiv = equiv_class(change.filename)
-      for f in equiv:
-        args.stats['deletions'].pop(f, None)
-      # If we get an 'M' for a file that wasn't the latest in a rename chain,
-      # then that equivalence class isn't valid anymore.
-      if equiv[-1] != change.filename:
-        for f in equiv:
-          if f in args.stats['equivalence']:
-            del args.stats['equivalence'][f]
-    else:
-      raise SystemExit("Unhandled change type: {}".format(change.type))
-
-  # We're just gathering data; don't spend time dumping the commit
-  commit.dumped = 2
+  def setup_equivalence_for_rename(stats, oldname, newname):
+    # if A is renamed to B and B is renamed to C, then the user thinks of
+    # A, B, and C as all being different names for the same 'file'.  We record
+    # this as an equivalence class:
+    #   stats['equivalence'][name] = (A,B,C)
+    # for name being each of A, B, and C.
+    old_tuple = stats['equivalence'].get(oldname, ())
+    if newname in old_tuple:
+      return
+    elif old_tuple:
+      new_tuple = tuple(list(old_tuple)+[newname])
+    else:
+      new_tuple = (oldname, newname)
+    for f in new_tuple:
+      stats['equivalence'][f] = new_tuple
+
+  def setup_or_update_rename_history(stats, commit, oldname, newname):
+    rename_commits = stats['rename_history'].get(oldname, set())
+    rename_commits.add(commit)
+    stats['rename_history'][oldname] = rename_commits
+
+  def handle_renames(stats, commit, change_types, filenames):
+    for index, change_type in enumerate(change_types):
+      if change_type == 'R':
+        oldname, newname = filenames[index], filenames[-1]
+        setup_equivalence_for_rename(stats, oldname, newname)
+        setup_or_update_rename_history(stats, commit, oldname, newname)
+
+  def handle_file(stats, graph, commit, modes, shas, filenames):
+    mode, sha, filename = modes[-1], shas[-1], filenames[-1]
+
+    # Figure out kind of deletions to undo for this file, and update lists
+    # of all-names-by-sha and all-filenames
+    delmode = 'tree_deletions'
+    if mode != '040000':
+      delmode = 'file_deletions'
+      stats['names'][sha].add(filename)
+      stats['allnames'].add(filename)
+
+    # If the file (or equivalence class of files) was recorded as deleted,
+    # clearly it isn't anymore
+    equiv = equiv_class(filename)
+    for f in equiv:
+      stats[delmode].pop(f, None)
+
+    # If we get a modify/add for a path that was renamed, we may need to break
+    # the equivalence class.  However, if the modify/add was on a branch that
+    # doesn't have the rename in its history, we are still okay.
+    need_to_break_equivalence = False
+    if equiv[-1] != filename:
+      for rename_commit in stats['rename_history'][filename]:
+        if graph.is_ancestor(rename_commit, commit):
+          need_to_break_equivalence = True
+
+    if need_to_break_equivalence:
+      for f in equiv:
+        if f in stats['equivalence']:
+          del stats['equivalence'][f]
+
+  graph.add_commit_and_parents(commit, parents)
+  for change in file_changes:
+    modes, shas, change_types, filenames = change
+    if len(parents) == 1 and change_types.startswith('R'):
+      change_types = 'R'  # remove the rename score; we don't care
+    if modes[-1] == '160000':
+      continue
+    elif modes[-1] == '000000':
+      # Track when files/directories are deleted; see 'R' below about equiv_class
+      for f in equiv_class(filenames[-1]):
+        if any(x == '040000' for x in modes[0:-1]):
+          stats['tree_deletions'][f] = date
+        else:
+          stats['file_deletions'][f] = date
+    elif change_types.strip('AMT') == '':
+      handle_file(stats, graph, commit, modes, shas, filenames)
+    elif modes[-1] == '040000' and change_types.strip('RAM') == '':
+      handle_file(stats, graph, commit, modes, shas, filenames)
+    elif change_types.strip('RAM') == '':
+      handle_file(stats, graph, commit, modes, shas, filenames)
+      handle_renames(stats, commit, change_types, filenames)
+    else:
+      raise SystemExit("Unhandled change type(s): {} (in commit {})"
+                       .format(change_types, commit))
 
 def gather_data(args):
   blob_size_progress = ProgressWriter()
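To see the equivalence bookkeeping in setup_equivalence_for_rename at work, here is the same logic as a hypothetical standalone function, walked through a two-step rename chain A -> B -> C (filenames made up for illustration):

# Standalone copy of the equivalence logic from the hunk above.
def setup_equivalence_for_rename(stats, oldname, newname):
    old_tuple = stats['equivalence'].get(oldname, ())
    if newname in old_tuple:
        return
    elif old_tuple:
        new_tuple = tuple(list(old_tuple) + [newname])
    else:
        new_tuple = (oldname, newname)
    for f in new_tuple:
        stats['equivalence'][f] = new_tuple

stats = {'equivalence': {}}
setup_equivalence_for_rename(stats, 'A', 'B')
# stats['equivalence'] == {'A': ('A', 'B'), 'B': ('A', 'B')}
setup_equivalence_for_rename(stats, 'B', 'C')
# Now every name maps to the full chain:
# {'A': ('A', 'B', 'C'), 'B': ('A', 'B', 'C'), 'C': ('A', 'B', 'C')}
print(stats['equivalence'])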
@@ -1893,36 +1934,74 @@ def gather_data(args):
   blob_size_progress.finish()
   stats = {'names': collections.defaultdict(set),
            'allnames' : set(),
-           'deletions': {},
+           'file_deletions': {},
+           'tree_deletions': {},
            'equivalence': {},
+           'rename_history': collections.defaultdict(set),
            'unpacked_size': unpacked_size,
-           'packed_size': packed_size}
+           'packed_size': packed_size,
+           'num_commits': 0}
 
-  # Setup the fast-export process
-  fep_cmd = ['git', 'fast-export',
-             '-M',
-             '--no-data',
-             '--show-original-ids',
-             '--always-show-modify-after-rename',
-             '--signed-tags=strip',
-             '--tag-of-filtered-object=rewrite',
-             '--use-done-feature'] + args.refs
-  fep = subprocess.Popen(fep_cmd, stdout=subprocess.PIPE)
-  input = fep.stdout
-  output = open(os.devnull, 'w')
-
-  # Create and run the filter
-  setattr(args, 'stats', stats)
-  analyze_filter = FastExportFilter(
-    commit_callback = lambda c : analyze_commit(args, c),
-  )
-  analyze_filter.run(input, output, quiet = args.quiet)
-  setattr(args, 'num_commits', analyze_filter.num_commits_parsed())
-
-  # Close the output, ensure fast-export has completed
-  output.close()
-  if fep.wait():
-    raise SystemExit("Error: fast-export failed; see above.")
+  # Setup the rev-list/diff-tree process
+  commit_parse_progress = ProgressWriter()
+  num_commits = 0
+  cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
+  dtp = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
+  f = dtp.stdout
+  line = f.next()
+  cont = bool(line)
+  graph = AncestryGraph()
+  while cont:
+    commit = line.rstrip()
+    parents = f.next().split()
+    date = f.next().rstrip()
+
+    # We expect a blank line next; if we get a non-blank line then
+    # this commit modified no files and we need to move on to the next.
+    # If there is no line, we've reached end-of-input.
+    try:
+      line = f.next().rstrip()
+      cont = True
+    except StopIteration:
+      cont = False
+
+    # If we haven't reached end of input, and we got a blank line meaning
+    # a commit that has modified files, then get the file changes associated
+    # with this commit.
+    file_changes = []
+    if cont and not line:
+      cont = False
+      for line in f:
+        if not line.startswith(':'):
+          cont = True
+          break
+        n = 1+max(1, len(parents))
+        assert line.startswith(':'*(n-1))
+        relevant = line[n-1:-1]
+        splits = relevant.split(None, n)
+        modes = splits[0:n]
+        splits = splits[n].split(None, n)
+        shas = splits[0:n]
+        splits = splits[n].split('\t')
+        change_types = splits[0]
+        filenames = [PathQuoting.dequote(x) for x in splits[1:]]
+        file_changes.append([modes, shas, change_types, filenames])
+
+    # Analyze this commit and update progress
+    analyze_commit(stats, graph, commit, parents, date, file_changes)
+    num_commits += 1
+    commit_parse_progress.show("Processed {} commits".format(num_commits))
+
+  # Show the final commits processed message and record the number of commits
+  commit_parse_progress.finish()
+  stats['num_commits'] = num_commits
+
+  # Close the output, ensure rev-list|diff-tree pipeline completed successfully
+  dtp.stdout.close()
+  if dtp.wait():
+    raise SystemExit("Error: rev-list|diff-tree pipeline failed; see above.")
+
+  return stats
 
 def do_analysis(args, git_dir):
   # Create the report file as necessary
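As a worked example of the parsing loop in the hunk above: for a merge commit with two parents, n = 3, and a combined raw line carries n-1 leading colons, n modes, n shas, the combined status letters, and then tab-separated path(s). A hedged standalone sketch with a made-up line (real shas are 40 hex digits, and the real code also runs PathQuoting.dequote on each path):

# Hypothetical combined raw line for a two-parent merge commit (n = 3).
line = '::100644 100644 100644 aaaaaaa bbbbbbb ccccccc MM\tREADME.md\n'
parents = ['parent1', 'parent2']

n = 1 + max(1, len(parents))
assert line.startswith(':' * (n - 1))
relevant = line[n - 1:-1]           # strip leading colons and the newline
splits = relevant.split(None, n)
modes = splits[0:n]                 # ['100644', '100644', '100644']
splits = splits[n].split(None, n)
shas = splits[0:n]                  # ['aaaaaaa', 'bbbbbbb', 'ccccccc']
splits = splits[n].split('\t')
change_types = splits[0]            # 'MM': one status letter per parent
filenames = splits[1:]              # ['README.md']
print([modes, shas, change_types, filenames])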
@@ -1936,10 +2015,10 @@ def do_analysis(args, git_dir):
     os.mkdir(reportdir)
 
   # Now gather the data we need
-  gather_data(args)
+  stats = gather_data(args)
 
-  def datestr(datetimeobj):
-    return datetimeobj.strftime('%F') if datetimeobj else '<present>'
+  def datestr(datetimestr):
+    return datetimestr if datetimestr else '<present>'
 
   def dirnames(path):
     while True:
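The datestr rewrite above follows from the change of data source: fast-export parsing yielded datetime objects that needed strftime, while %cd with --date=short already arrives as a 'YYYY-MM-DD' string. A small illustration (dates made up):

import datetime

# Old pipeline: committer dates were datetime objects.
old_when = datetime.datetime(2019, 2, 13)
print(old_when.strftime('%F') if old_when else '<present>')  # 2019-02-13

# New pipeline: --date=short already gives 'YYYY-MM-DD' strings;
# None/empty still means "not deleted", hence '<present>'.
new_when = '2019-02-13'
print(new_when if new_when else '<present>')                 # 2019-02-13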
@@ -1956,11 +2035,11 @@
               'unpacked': collections.defaultdict(int)}
   dir_size = {'packed': collections.defaultdict(int),
               'unpacked': collections.defaultdict(int)}
-  for sha in args.stats['names']:
-    size = {'packed': args.stats['packed_size'][sha],
-            'unpacked': args.stats['unpacked_size'][sha]}
+  for sha in stats['names']:
+    size = {'packed': stats['packed_size'][sha],
+            'unpacked': stats['unpacked_size'][sha]}
     for which in ('packed', 'unpacked'):
-      for name in args.stats['names'][sha]:
+      for name in stats['names'][sha]:
         total_size[which] += size[which]
         path_size[which][name] += size[which]
         basename, ext = os.path.splitext(name)
@@ -1970,9 +2049,8 @@
 
   # Determine if and when extensions and directories were deleted
   ext_deleted_data = {}
-  dir_deleted_data = {}
-  for name in args.stats['allnames']:
-    when = args.stats['deletions'].get(name, None)
+  for name in stats['allnames']:
+    when = stats['file_deletions'].get(name, None)
 
     # Update the extension
     basename, ext = os.path.splitext(name)
@@ -1984,20 +2062,14 @@
     else:
       ext_deleted_data[ext] = when
 
-    # Update the dirs
-    for dirname in dirnames(name):
-      if when is None:
-        dir_deleted_data[dirname] = None
-      elif dirname in dir_deleted_data:
-        if dir_deleted_data[dirname] is not None:
-          dir_deleted_data[dirname] = max(dir_deleted_data[dirname], when)
-      else:
-        dir_deleted_data[dirname] = when
+  dir_deleted_data = {}
+  for name in dir_size['packed']:
+    dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
 
   with open(os.path.join(reportdir, "README"), 'w') as f:
     # Give a basic overview of this file
     f.write("== Overall Statistics ==\n")
-    f.write("  Number of commits: {}\n".format(args.num_commits))
+    f.write("  Number of commits: {}\n".format(stats['num_commits']))
     f.write("  Number of filenames: {}\n".format(len(path_size['packed'])))
     f.write("  Number of directories: {}\n".format(len(dir_size['packed'])))
     f.write("  Number of file extensions: {}\n".format(len(ext_size['packed'])))
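The simplification above works because diff-tree is invoked with -t, so directory (tree) deletions land directly in stats['tree_deletions'] during analyze_commit; deletion dates no longer have to be inferred by walking each file's dirnames. A small illustration with made-up data:

# Made-up data: one deleted directory, one live one.
stats = {'tree_deletions': {'src/old': '2019-01-31'}}
dir_size = {'packed': {'src/old': 1234, 'src/new': 5678}}

dir_deleted_data = {}
for name in dir_size['packed']:
    dir_deleted_data[name] = stats['tree_deletions'].get(name, None)
print(dir_deleted_data)  # {'src/old': '2019-01-31', 'src/new': None}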
@@ -2071,17 +2143,6 @@
         anymore.  We could try to portray this to the user, but it's easier
         for the user to just break the pairing and only report unbroken
         rename pairings to the user.
-      * Since modifying a renamed file on the side of history that doesn't
-        rename it should be expected to be common (unlike modifying a deleted
-        file on the side of history that doesn't delete it), tracking history
-        becomes more important to avoid incorrectly breaking rename chains.
-        This has not yet been implemented.  This seriously raises the risk
-        of erroneously breaking rename pairings; a future release may address
-        this shortcoming.
-      * We only use rename detection, not copy detection.  However, that
-        means that if some commit in history renamed two files into the same
-        location, we won't pick up one of the two renames and will instead
-        report that branch as having been deleted.
       * The ability for users to rename files differently in different
         branches means that our chains of renames will not necessarily be
         linear but may branch out.
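The removed caveat above ("tracking history ... has not yet been implemented") is exactly what the new AncestryGraph addresses: stats['rename_history'] remembers which commits performed each rename, and handle_file only breaks an equivalence class when one of those commits is an ancestor of the commit doing the modify. A minimal sketch of that idea; git-filter-repo's actual AncestryGraph is a more elaborate structure, so treat this as an assumption-laden stand-in:

# Simplified illustrative ancestry graph; NOT git-filter-repo's
# real AncestryGraph implementation.
class SimpleAncestryGraph:
    def __init__(self):
        self._parents = {}

    def add_commit_and_parents(self, commit, parents):
        self._parents[commit] = list(parents)

    def is_ancestor(self, possible_ancestor, check):
        # Walk ancestors of 'check' until we find (or exhaust) candidates.
        todo = [check]
        seen = set()
        while todo:
            commit = todo.pop()
            if commit == possible_ancestor:
                return True
            if commit not in seen:
                seen.add(commit)
                todo.extend(self._parents.get(commit, []))
        return False

graph = SimpleAncestryGraph()
graph.add_commit_and_parents('A', [])
graph.add_commit_and_parents('B', ['A'])
graph.add_commit_and_parents('C', ['A'])  # sibling branch
print(graph.is_ancestor('A', 'C'))  # True
print(graph.is_ancestor('B', 'C'))  # False: a rename on B's branch does not
                                    # break equivalence for a modify on C's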
@@ -2093,7 +2154,7 @@
   # too.
   with open(os.path.join(reportdir, "renames.txt"), 'w') as f:
     seen = set()
-    for pathname,equiv_group in sorted(args.stats['equivalence'].iteritems(),
+    for pathname,equiv_group in sorted(stats['equivalence'].iteritems(),
                                        key=lambda x:x[1]):
       if equiv_group in seen:
         continue
@@ -2156,7 +2217,7 @@
     f.write("Format: unpacked size, packed size, date deleted, path name(s)\n")
     for pathname, size in sorted(path_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
-      when = args.stats['deletions'].get(pathname, None)
+      when = stats['file_deletions'].get(pathname, None)
       if when:
         f.write("  {:10d} {:10d} {:10s} {}\n"
                 .format(path_size['unpacked'][pathname],
@@ -2169,7 +2230,7 @@
     f.write("Format: unpacked size, packed size, date deleted, directory name\n")
     for pathname, size in sorted(path_size['packed'].iteritems(),
                                  key=lambda x:x[1], reverse=True):
-      when = args.stats['deletions'].get(pathname, None)
+      when = stats['file_deletions'].get(pathname, None)
       f.write("  {:10d} {:10d} {:10s} {}\n"
               .format(path_size['unpacked'][pathname],
                       size,
@@ -2180,19 +2241,19 @@
   with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f:
     f.write("== Files by sha and associated pathnames in reverse size ==\n")
     f.write("Format: sha, unpacked size, packed size, filename(s) object stored as\n")
-    for sha, size in sorted(args.stats['packed_size'].iteritems(),
+    for sha, size in sorted(stats['packed_size'].iteritems(),
                             key=lambda x:x[1], reverse=True):
-      if sha not in args.stats['names']:
+      if sha not in stats['names']:
         # Some objects in the repository might not be referenced, or not
         # referenced by the branches/tags the user cares about; skip them.
         continue
-      names_with_sha = args.stats['names'][sha]
+      names_with_sha = stats['names'][sha]
       if len(names_with_sha) == 1:
         names_with_sha = names_with_sha.pop()
      else:
         names_with_sha = sorted(list(names_with_sha))
       f.write("  {} {:10d} {:10d} {}\n".format(sha,
-                                               args.stats['unpacked_size'][sha],
+                                               stats['unpacked_size'][sha],
                                                size,
                                                names_with_sha))
 