From 03507e57f5a07499b1e0341c079eecccf21c0967 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 18 Dec 2018 16:18:47 -0800 Subject: [PATCH] filter-repo: buffer subprocess stdout to significantly improve performance Apparently, the default for subprocess stdout is unbuffered; switching it to buffered yields a huge 40% speedup. Doing this also exposes the need to add fi_input.flush() calls, highlighting another performance issue. We may be able to have fewer such calls with some refactoring, but that is a bigger separate change. Just having them highlighted to remind about them as a performance issue is good for now. Signed-off-by: Elijah Newren --- git-filter-repo | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index b071b6b..bea4a00 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -1120,6 +1120,7 @@ class FastExportFilter(object): # buffers filling up so I instead read from it as I go. for change in file_changes: fi_input.write("ls :{} {}\n".format(from_commit, change.filename)) + fi_input.flush() parent_version = fi_output.readline().split() if change.type == 'D': if parent_version == ['missing', change.filename]: @@ -1128,6 +1129,7 @@ class FastExportFilter(object): blob_sha = change.blob_id if isinstance(change.blob_id, int): fi_input.write("get-mark :{}\n".format(change.blob_id)) + fi_input.flush() blob_sha = fi_output.readline().rstrip() if parent_version == [change.mode, 'blob', blob_sha, change.filename]: unnecessary_filechanges.add(change) @@ -1173,6 +1175,7 @@ class FastExportFilter(object): if commit.original_id and fast_import_pipes: fi_input, fi_output = fast_import_pipes fi_input.write("get-mark :{}\n".format(commit.id)) + fi_input.flush() orig_id = commit.original_id new_id = fi_output.readline().rstrip() self._commit_renames[orig_id] = new_id @@ -1531,6 +1534,7 @@ def get_commit_count(repo, *args): if len(args) == 1 and isinstance(args[0], list): args = args[0] p1 = subprocess.Popen(["git", "rev-list"] + args, + bufsize=-1, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=repo) p2 = subprocess.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE) @@ -1919,6 +1923,7 @@ def gather_data(args): # Get sizes of blobs by sha1 a='--batch-check=%(objectname) %(objecttype) %(objectsize) %(objectsize:disk)' cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', a], + bufsize = -1, stdout = subprocess.PIPE) unpacked_size = {} packed_size = {} @@ -1946,7 +1951,7 @@ def gather_data(args): commit_parse_progress = ProgressWriter() num_commits = 0 cmd = 'git rev-list --topo-order --reverse {} | git diff-tree --stdin --always --root --format="%H%n%P%n%cd" --date=short -M -t -c --raw --combined-all-paths'.format(' '.join(args.refs)) - dtp = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE) f = dtp.stdout line = f.next() cont = bool(line) @@ -2400,7 +2405,7 @@ def run_fast_filter(): '--tag-of-filtered-object=rewrite', '--no-data', '--use-done-feature'] + args.refs - fep = subprocess.Popen(fep_cmd, stdout=subprocess.PIPE) + fep = subprocess.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE) input = fep.stdout if args.dry_run or args.debug: fe_orig = os.path.join(results_tmp_dir, 'fast-export.original') @@ -2414,7 +2419,9 @@ def run_fast_filter(): pipes = None if not args.dry_run: fip_cmd = 'git fast-import --force --quiet'.split() - fip = subprocess.Popen(fip_cmd, stdin=subprocess.PIPE, + fip = subprocess.Popen(fip_cmd, + bufsize=-1, + stdin=subprocess.PIPE, stdout=subprocess.PIPE) pipes = (fip.stdin, fip.stdout) if args.dry_run or args.debug: