From af081d0fcedc1c9916485830dff95bc98238ce3c Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Thu, 13 Sep 2018 18:02:41 -0700 Subject: [PATCH] filter-repo: add automatic rewriting of commit hashes in commit messages Commit messages often refer to past commits; while rewriting commits we would also like to update these commit messages to refer to the new commit names. In the case that a commit message references another commit which was dropped by the filtering process, we have no way to rewrite the commit message to reference a valid commit hash. Instead of dying, note the suboptimal commit in the suboptimal-issues file. Signed-off-by: Elijah Newren --- git-filter-repo | 59 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/git-filter-repo b/git-filter-repo index 9947b19..c43eac7 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -665,6 +665,26 @@ class FastExportFilter(object): # next into master"). self._commits_no_longer_merges = [] + # A dict of original_ids to new_ids; filtering commits means getting + # new commit hashes (sha1sums), and we record the mapping both for + # diagnostic purposes and so we can rewrite commit messages. Note that + # the new_id can be None rather than a commit hash if the original + # commit became empty and was pruned or was otherwise dropped. + self._commit_renames = {} + + # A dict of commit_hash[0:7] -> set(commit_hashes with that prefix). + # + # It's common for commit messages to refer to commits by abbreviated + # commit hashes, as short as 7 characters. To facilitate translating + # such short hashes, we have a mapping of prefixes to full old hashes. + self._commit_short_old_hashes = collections.defaultdict(set) + + # A set of commit hash references appearing in commit messages which + # mapped to a valid commit that was removed entirely in the filtering + # process. 
The commit message will continue to reference the + # now-missing commit hash, since there was nothing to map it to. + self._commits_referenced_but_removed = set() + # A handle to the input source for the fast-export data self._input = None @@ -867,6 +887,26 @@ class FastExportFilter(object): if not reset.dumped: reset.dump(self._output) + def _translate_commit_hash(self, matchobj): + old_hash = matchobj.group(1) + orig_len = len(old_hash) + if old_hash not in self._commit_renames: + if old_hash[0:7] not in self._commit_short_old_hashes: + return old_hash + possibilities = self._commit_short_old_hashes[old_hash[0:7]] + matches = [x for x in possibilities + if x[0:orig_len] == old_hash] + if len(matches) != 1: + return old_hash + old_hash = matches[0] + + new_hash = self._commit_renames[old_hash] + if new_hash is None: + self._commits_referenced_but_removed.add(old_hash) + return old_hash[0:orig_len] + else: + return new_hash[0:orig_len] + def _parse_commit(self, fast_import_pipes): """ Parse input data into a Commit object. 
Once the Commit has been created, @@ -897,6 +937,9 @@ class FastExportFilter(object): (committer_name, committer_email, committer_date) commit_msg = self._parse_data() + commit_msg = re.sub(r'(\b[0-9a-f]{7,40}\b)', + self._translate_commit_hash, + commit_msg) parents = [] parents.append(self._parse_optional_parent_ref('from')) @@ -1019,12 +1062,14 @@ class FastExportFilter(object): (not had_file_changes and len(parents) >= 1)): commit.dump(self._output) new_id = None - # Determine the mapping of old commit hash to new one + # Record the mapping of old commit hash to new one if commit.original_id and fast_import_pipes: fi_input, fi_output = fast_import_pipes fi_input.write("get-mark :{}\n".format(commit.id)) orig_id = commit.original_id new_id = fi_output.readline().rstrip() + self._commit_renames[orig_id] = new_id + self._commit_short_old_hashes[orig_id[0:7]].add(orig_id) # Now, record if this was a merge commit that turned into a non-merge # commit. if num_original_parents > 1 and not merge_commit: @@ -1034,6 +1079,7 @@ class FastExportFilter(object): # lose any refs this way. self._seen_refs[branch] = commit.first_parent() commit.skip(commit.first_parent()) + self._commit_renames[commit.original_id] = None def _parse_tag(self): """ @@ -1201,6 +1247,17 @@ class FastExportFilter(object): f.write(' {} {}\n'.format(oldhash, newhash)) f.write('\n') + if self._commits_referenced_but_removed: + issues_found = True + f.write(textwrap.dedent(''' + The following commits were filtered out, but referenced in another + commit message. The reference to the now-nonexistent commit hash + (or a substring thereof) was left as-is in any commit messages: + '''[1:])) + for bad_commit_reference in self._commits_referenced_but_removed: + f.write(' {}\n'.format(bad_commit_reference)) + f.write('\n') + if not issues_found: f.write("No filtering problems encountered.")