filter-repo: add automatic rewriting of commit hashes in commit messages

Commit messages often refer to past commits; while rewriting commits we
would also like to update these commit messages to refer to the new
commit names.

In the case that a commit message references another commit which was
dropped by the filtering process, we have no way to rewrite the commit
message to reference a valid commit hash.  Instead of dying, note the
suboptimal commit in the suboptimal-issues file.

Signed-off-by: Elijah Newren <newren@gmail.com>
This commit is contained in:
Elijah Newren 2018-09-13 18:02:41 -07:00
parent f95308c5eb
commit af081d0fce

View File

@ -665,6 +665,26 @@ class FastExportFilter(object):
# next into master").
self._commits_no_longer_merges = []
# A dict of original_ids to new_ids; filtering commits means getting
# new commit hashes (sha1sums), and we record the mapping both for
# diagnostic purposes and so we can rewrite commit messages. Note that
# the new_id can be None rather than a commit hash if the original
# commit became empty and was pruned or was otherwise dropped.
self._commit_renames = {}
# A dict of commit_hash[0:7] -> set(commit_hashes with that prefix).
#
# It's common for commit messages to refer to commits by abbreviated
# commit hashes, as short as 7 characters. To facilitate translating
# such short hashes, we have a mapping of prefixes to full old hashes.
self._commit_short_old_hashes = collections.defaultdict(set)
# A set of commit hash references appearing in commit messages which
# mapped to a valid commit that was removed entirely in the filtering
# process. The commit message will continue to reference the
# now-missing commit hash, since there was nothing to map it to.
self._commits_referenced_but_removed = set()
# A handle to the input source for the fast-export data
self._input = None
@ -867,6 +887,26 @@ class FastExportFilter(object):
if not reset.dumped:
reset.dump(self._output)
def _translate_commit_hash(self, matchobj):
old_hash = matchobj.group(1)
orig_len = len(old_hash)
if old_hash not in self._commit_renames:
if old_hash[0:7] not in self._commit_short_old_hashes:
return old_hash
possibilities = self._commit_short_old_hashes[old_hash[0:7]]
matches = [x for x in possibilities
if x[0:orig_len] == old_hash]
if len(matches) != 1:
return old_hash
old_hash = matches[0]
new_hash = self._commit_renames[old_hash]
if new_hash is None:
self._commits_referenced_but_removed.add(old_hash)
return old_hash[0:orig_len]
else:
return new_hash[0:orig_len]
def _parse_commit(self, fast_import_pipes):
"""
Parse input data into a Commit object. Once the Commit has been created,
@ -897,6 +937,9 @@ class FastExportFilter(object):
(committer_name, committer_email, committer_date)
commit_msg = self._parse_data()
commit_msg = re.sub(r'(\b[0-9a-f]{7,40}\b)',
self._translate_commit_hash,
commit_msg)
parents = []
parents.append(self._parse_optional_parent_ref('from'))
@ -1019,12 +1062,14 @@ class FastExportFilter(object):
(not had_file_changes and len(parents) >= 1)):
commit.dump(self._output)
new_id = None
# Determine the mapping of old commit hash to new one
# Record the mapping of old commit hash to new one
if commit.original_id and fast_import_pipes:
fi_input, fi_output = fast_import_pipes
fi_input.write("get-mark :{}\n".format(commit.id))
orig_id = commit.original_id
new_id = fi_output.readline().rstrip()
self._commit_renames[orig_id] = new_id
self._commit_short_old_hashes[orig_id[0:7]].add(orig_id)
# Now, record if this was a merge commit that turned into a non-merge
# commit.
if num_original_parents > 1 and not merge_commit:
@ -1034,6 +1079,7 @@ class FastExportFilter(object):
# lose any refs this way.
self._seen_refs[branch] = commit.first_parent()
commit.skip(commit.first_parent())
self._commit_renames[commit.original_id] = None
def _parse_tag(self):
"""
@ -1201,6 +1247,17 @@ class FastExportFilter(object):
f.write(' {} {}\n'.format(oldhash, newhash))
f.write('\n')
if self._commits_referenced_but_removed:
issues_found = True
f.write(textwrap.dedent('''
The following commits were filtered out, but referenced in another
commit message. The reference to the now-nonexistent commit hash
(or a substring thereof) was left as-is in any commit messages:
'''[1:]))
for bad_commit_reference in self._commits_referenced_but_removed:
f.write(' {}\n'.format(bad_commit_reference))
f.write('\n')
if not issues_found:
f.write("No filtering problems encountered.")