From d13f7e91788fd2166cf42af96e98c624ac60c8d2 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 4 Sep 2018 17:43:06 -0700 Subject: [PATCH] filter-repo: fix detection of merge becoming empty commit In the previous commit, we detected when an entire line of history back to a common ancestor of the merge became empty commits, and avoided having a commit be merged with itself. This commits looks through the changes specified in the commit, which are always specified relative to the first parent, so that if the first parent side was the empty one we can still detect if the merge commit adds no extra changes relative to its remaining parent. Signed-off-by: Elijah Newren --- git-filter-repo | 58 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 5 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index b959efc..c6fcda1 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -831,7 +831,7 @@ class FastExportFilter(object): if not reset.dumped: reset.dump(self._output) - def _parse_commit(self): + def _parse_commit(self, fast_import_pipes): """ Parse input data into a Commit object. Once the Commit has been created, it will be handed off to the appropriate callbacks. Current-line will @@ -875,6 +875,7 @@ class FastExportFilter(object): # merge a commit with its ancestor. Remove parents that are an # ancestor of another parent.) num_original_parents = len(parents) + check_merge_now_empty = False if num_original_parents > 1: to_remove = [] for cur in xrange(num_original_parents): @@ -884,6 +885,8 @@ class FastExportFilter(object): to_remove.append(cur) for x in reversed(to_remove): parents.pop(x) + if len(parents) == 1: + check_merge_now_empty = True # Record our new parents after above pruning of parents representing # pruned empty histories @@ -900,6 +903,48 @@ class FastExportFilter(object): if self._currentline == '\n': self._advance_currentline() + # If we had a merge commit and the first parent history back to the + # merge base was entirely composed of commits made empty by our + # filtering, it is likely that this merge commit is empty and can be + # pruned too. Check by comparing the contents of this merge to its + # remaining parent. + # + # NOTES on why/how this works: + # 1. fast-export always gives file changes in a merge commit relative + # to the first parent. + # 2. The only way this 'if' is active is when the first parent was + # an ancestor of what is now the only remaining parent + # 3. The two above imply that the file changes we're looking at are + # just for the line of history for the remaining parent, and show + # all changes needed to make the original first parent (whose tree + # matched an ancestor of the remaining parent) match the merge's tree. + # 4. If the versions of all specified files in the remaining parent + # match the file change versions, then this "merge" commit is + # actually going to be an empty non-merge commit and we should prune + # it. + if check_merge_now_empty and fast_import_pipes: + unnecessary_filechanges = set() + fi_input, fi_output = fast_import_pipes + # Optimization note: we could have two loops over file_changes, the + # first doing all the fi_input.write() calls, and the second doing the + # rest. But I'm worried about fast-import blocking on fi_output + # buffers filling up so I instead read from it as I go. + for change in file_changes: + fi_input.write("ls :{} {}\n".format(from_commit, change.filename)) + parent_version = fi_output.readline().split() + if change.type == 'D': + if parent_version == ['missing', change.filename]: + unnecessary_filechanges.add(change) + else: + blob_sha = change.blob_id + if isinstance(change.blob_id, int): + fi_input.write("get-mark :{}\n".format(change.blob_id)) + blob_sha = fi_output.readline().rstrip() + if parent_version == [change.mode, 'blob', blob_sha, change.filename]: + unnecessary_filechanges.add(change) + file_changes = [change for change in file_changes + if change not in unnecessary_filechanges] + # Okay, now we can finally create the Commit object commit = Commit(branch, author_name, author_email, author_date, @@ -1051,7 +1096,7 @@ class FastExportFilter(object): def get_seen_refs(self): return self._seen_refs.keys() - def run(self, *args): + def run(self, *args, **kwargs): """ This method performs the filter. The method optionally takes two arguments. The first represents the source repository (either a file object @@ -1110,7 +1155,7 @@ class FastExportFilter(object): elif self._currentline.startswith('reset'): self._parse_reset() elif self._currentline.startswith('commit'): - self._parse_commit() + self._parse_commit(kwargs.get('fast_import_pipes', None)) elif self._currentline.startswith('tag'): self._parse_tag() elif self._currentline.startswith('progress'): @@ -1473,9 +1518,12 @@ def run_fast_filter(): print(" (saving a copy of the output at {})".format(fe_orig)) # Determine where to send output + pipes = None if not args.dry_run: fip_cmd = 'git fast-import --force --quiet'.split() - fip = subprocess.Popen(fip_cmd, stdin=subprocess.PIPE) + fip = subprocess.Popen(fip_cmd, stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + pipes = (fip.stdin, fip.stdout) if args.dry_run or args.debug: fe_filt = os.path.join(results_tmp_dir, 'fast-export.filtered') output = open(fe_filt, 'w') @@ -1490,7 +1538,7 @@ def run_fast_filter(): filter = FastExportFilter( commit_callback = lambda c : tweak_commit(args, c), ) - filter.run(input, output) + filter.run(input, output, fast_import_pipes = pipes) # Close the output, ensure fast-export and fast-import have completed output.close()