filter-repo: buffer subprocess stdout to significantly improve performance

Apparently, the default for subprocess stdout is unbuffered; switching
it to buffered yields a huge 40% speedup.  Doing this also exposes the
need to add fi_input.flush() calls, highlighting another performance
issue.  We may be able to have fewer such calls with some refactoring,
but that is a bigger separate change.  Just having them highlighted here,
as a reminder of this remaining performance issue, is good enough for now.

Signed-off-by: Elijah Newren <newren@gmail.com>
This commit is contained in:
Elijah Newren 2018-12-18 16:18:47 -08:00
parent 9ebd3117ca
commit 03507e57f5

View File

@ -1120,6 +1120,7 @@ class FastExportFilter(object):
# buffers filling up so I instead read from it as I go.
for change in file_changes:
fi_input.write("ls :{} {}\n".format(from_commit, change.filename))
fi_input.flush()
parent_version = fi_output.readline().split()
if change.type == 'D':
if parent_version == ['missing', change.filename]:
@ -1128,6 +1129,7 @@ class FastExportFilter(object):
blob_sha = change.blob_id
if isinstance(change.blob_id, int):
fi_input.write("get-mark :{}\n".format(change.blob_id))
fi_input.flush()
blob_sha = fi_output.readline().rstrip()
if parent_version == [change.mode, 'blob', blob_sha, change.filename]:
unnecessary_filechanges.add(change)
@ -1173,6 +1175,7 @@ class FastExportFilter(object):
if commit.original_id and fast_import_pipes:
fi_input, fi_output = fast_import_pipes
fi_input.write("get-mark :{}\n".format(commit.id))
fi_input.flush()
orig_id = commit.original_id
new_id = fi_output.readline().rstrip()
self._commit_renames[orig_id] = new_id
@ -1531,6 +1534,7 @@ def get_commit_count(repo, *args):
if len(args) == 1 and isinstance(args[0], list):
args = args[0]
p1 = subprocess.Popen(["git", "rev-list"] + args,
bufsize=-1,
stdout=subprocess.PIPE, stderr=subprocess.PIPE,
cwd=repo)
p2 = subprocess.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE)
@ -1919,6 +1923,7 @@ def gather_data(args):
# Get sizes of blobs by sha1
a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)'
cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a],
bufsize = -1,
stdout = subprocess.PIPE)
unpacked_size = {}
packed_size = {}
@ -1946,7 +1951,7 @@ def gather_data(args):
commit_parse_progress = ProgressWriter()
num_commits = 0
cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs))
dtp = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
f = dtp.stdout
line = f.next()
cont = bool(line)
@ -2400,7 +2405,7 @@ def run_fast_filter():
'--tag-of-filtered-object=rewrite',
'--no-data',
'--use-done-feature'] + args.refs
fep = subprocess.Popen(fep_cmd, stdout=subprocess.PIPE)
fep = subprocess.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE)
input = fep.stdout
if args.dry_run or args.debug:
fe_orig = os.path.join(results_tmp_dir, 'fast-export.original')
@ -2414,7 +2419,9 @@ def run_fast_filter():
pipes = None
if not args.dry_run:
fip_cmd = 'git fast-import --force --quiet'.split()
fip = subprocess.Popen(fip_cmd, stdin=subprocess.PIPE,
fip = subprocess.Popen(fip_cmd,
bufsize=-1,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
pipes = (fip.stdin, fip.stdout)
if args.dry_run or args.debug: