mirror of
https://github.com/newren/git-filter-repo.git
synced 2024-07-06 18:32:14 +02:00
4f149daacc
Signed-off-by: Elijah Newren <newren@gmail.com>
2218 lines
80 KiB
Python
Executable File
2218 lines
80 KiB
Python
Executable File
#!/usr/bin/env python
|
|
|
|
"""
|
|
Simple program for filtering git repositories, similar to git filter-branch,
|
|
BFG repo cleaner, and others. The basic idea is that it works by running
|
|
git fast-export <options> | filter | git fast-import <options>
|
|
where this program not only launches the whole pipeline but also serves as
|
|
the 'filter' in the middle. It does a few additional things on top as well
|
|
in order to make it into a well-rounded filtering tool.
|
|
"""
|
|
|
|
from __future__ import print_function
|
|
|
|
import argparse
|
|
import collections
|
|
import fnmatch
|
|
import os
|
|
import re
|
|
import StringIO
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import textwrap
|
|
|
|
from email.Utils import unquote
|
|
from datetime import tzinfo, timedelta, datetime
|
|
|
|
__all__ = ["Blob", "Reset", "FileChanges", "Commit", "Tag", "Progress",
|
|
"Checkpoint", "FastExportFilter", "FixedTimeZone",
|
|
"fast_export_output", "fast_import_input", "get_commit_count",
|
|
"get_total_objects", "record_id_rename"]
|
|
|
|
|
|
def _timedelta_to_seconds(delta):
|
|
"""
|
|
Converts timedelta to seconds
|
|
"""
|
|
offset = delta.days*86400 + delta.seconds + (delta.microseconds+0.0)/1000000
|
|
return round(offset)
|
|
|
|
def _write_date(file_, date):
  """
  Write date to the already-open file file_ in fast-export format:
  seconds-since-epoch, a space, then the timezone name.
  """
  epoch = datetime.fromtimestamp(0, date.tzinfo)
  elapsed_seconds = round((date - epoch).total_seconds())
  file_.write('%d %s' % (elapsed_seconds, date.tzinfo.tzname(0)))
|
|
|
|
class FixedTimeZone(tzinfo):
  """
  A tzinfo with a fixed offset in minutes east from UTC, constructed from
  a git-style offset string such as '+0530', '-0100', or '0200'.
  """

  def __init__(self, offset_string):
    tzinfo.__init__(self)
    sign, hh, mm = re.match(r'^([-+]?)(\d\d)(\d\d)$', offset_string).groups()
    # BUGFIX: the previous code ("sign and -1 or 1") treated ANY non-empty
    # sign -- including an explicit '+' -- as negative, so '+0200' parsed
    # as two hours west of UTC.  Only '-' denotes a negative offset.
    factor = -1 if sign == '-' else 1
    self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm)))
    self._offset_string = offset_string

  def utcoffset(self, dt):
    """Return the fixed offset east of UTC as a timedelta."""
    return self._offset

  def tzname(self, dt):
    """Return the original offset string as the timezone's name."""
    return self._offset_string

  def dst(self, dt):
    """No daylight-savings handling; always zero."""
    return timedelta(0)
|
|
|
|
class AncestryGraph(object):
  """
  Maintains a directed acyclic graph of commits so that we can answer
  whether one commit is an ancestor of another.
  """

  def __init__(self):
    # Next internal integer to assign to an external commit identifier
    self.cur_value = 0

    # Maps external identifiers to the small integers used in self.graph
    self.value = {}

    # Maps internal integer -> (depth, list-of-parent-integers); a
    # commit's depth is one more than the max depth of its ancestors.
    self.graph = {}

  def add_commit_and_parents(self, commit, parents):
    """
    Record in graph that commit has the given parents.  parents _MUST_
    all have been recorded already; commit _MUST_ not have been.
    """
    assert all(p in self.value for p in parents)
    assert commit not in self.value

    # Assign the next internal integer to commit and look up its parents
    self.cur_value += 1
    self.value[commit] = self.cur_value
    parent_values = [self.value[p] for p in parents]

    # Depth is one more than the deepest parent (1 for a root commit)
    if parent_values:
      depth = 1 + max(self.graph[p][0] for p in parent_values)
    else:
      depth = 1
    self.graph[self.cur_value] = (depth, parent_values)

  def is_ancestor(self, possible_ancestor, check):
    """
    Return whether possible_ancestor is an ancestor of check.
    """
    target, start = self.value[possible_ancestor], self.value[check]
    target_depth = self.graph[target][0]
    work = [start]
    seen = set()
    while work:
      current = work.pop()
      if current in seen:
        continue
      seen.add(current)
      current_depth, parent_values = self.graph[current]
      if current == target:
        return True
      # Anything at or above target's depth cannot contain target among
      # its ancestors, so skip walking its parents.
      if current_depth <= target_depth:
        continue
      work.extend(parent_values)
    return False
|
|
|
|
|
|
class _IDs(object):
|
|
"""
|
|
A class that maintains the 'name domain' of all the 'marks' (short int
|
|
id for a blob/commit git object). The reason this mechanism is necessary
|
|
is because the text of fast-export may refer to an object using a different
|
|
mark than the mark that was assigned to that object using IDS.new(). This
|
|
class allows you to translate the fast-export marks (old) to the marks
|
|
assigned from IDS.new() (new).
|
|
|
|
Note that there are two reasons why the marks may differ: (1) The
|
|
user manually creates Blob or Commit objects (for insertion into the
|
|
stream) (2) We're reading the data from two different repositories
|
|
and trying to combine the data (git fast-export will number ids from
|
|
1...n, and having two 1's, two 2's, two 3's, causes issues).
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""
|
|
Init
|
|
"""
|
|
# The id for the next created blob/commit object
|
|
self._next_id = 1
|
|
|
|
# A map of old-ids to new-ids (1:1 map)
|
|
self._translation = {}
|
|
|
|
# A map of new-ids to every old-id that points to the new-id (1:N map)
|
|
self._reverse_translation = {}
|
|
|
|
def new(self):
|
|
"""
|
|
Should be called whenever a new blob or commit object is created. The
|
|
returned value should be used as the id/mark for that object.
|
|
"""
|
|
rv = self._next_id
|
|
self._next_id += 1
|
|
return rv
|
|
|
|
def record_rename(self, old_id, new_id, handle_transitivity = False):
|
|
"""
|
|
Record that old_id is being renamed to new_id.
|
|
"""
|
|
if old_id != new_id:
|
|
# old_id -> new_id
|
|
self._translation[old_id] = new_id
|
|
|
|
# Transitivity will be needed if new commits are being inserted mid-way
|
|
# through a branch.
|
|
if handle_transitivity:
|
|
# Anything that points to old_id should point to new_id
|
|
if old_id in self._reverse_translation:
|
|
for id_ in self._reverse_translation[old_id]:
|
|
self._translation[id_] = new_id
|
|
|
|
# Record that new_id is pointed to by old_id
|
|
if new_id not in self._reverse_translation:
|
|
self._reverse_translation[new_id] = []
|
|
self._reverse_translation[new_id].append(old_id)
|
|
|
|
def translate(self, old_id):
|
|
"""
|
|
If old_id has been mapped to an alternate id, return the alternate id.
|
|
"""
|
|
if old_id in self._translation:
|
|
return self._translation[old_id]
|
|
else:
|
|
return old_id
|
|
|
|
def __str__(self):
|
|
"""
|
|
Convert IDs to string; used for debugging
|
|
"""
|
|
rv = "Current count: %d\nTranslation:\n" % self._next_id
|
|
for k in sorted(self._translation):
|
|
rv += " %d -> %d\n" % (k, self._translation[k])
|
|
|
|
rv += "Reverse translation:\n"
|
|
for k in sorted(self._reverse_translation):
|
|
rv += " " + str(k) + " -> " + str(self._reverse_translation[k]) + "\n"
|
|
|
|
return rv
|
|
|
|
def _avoid_ids_below(self, skip_value):
|
|
"""
|
|
Make sure that we don't use ids <= skip_value
|
|
"""
|
|
self._next_id = max(self._next_id, skip_value + 1)
|
|
|
|
class _GitElement(object):
|
|
"""
|
|
The base class for all git elements that we create.
|
|
"""
|
|
|
|
def __init__(self):
|
|
# A string that describes what type of Git element this is
|
|
self.type = None
|
|
|
|
# A flag telling us if this Git element has been dumped
|
|
# (i.e. printed) or skipped. Typically elements that have been
|
|
# dumped or skipped will not be dumped again.
|
|
self.dumped = 0
|
|
|
|
def dump(self, file_):
|
|
"""
|
|
This version should never be called. Derived classes need to
|
|
override! We should note that subclasses should implement this
|
|
method such that the output would match the format produced by
|
|
fast-export.
|
|
"""
|
|
raise SystemExit("Unimplemented function: %s.dump()" % type(self).__name__)
|
|
|
|
def __str__(self):
|
|
"""
|
|
Convert GitElement to string; used for debugging
|
|
"""
|
|
old_dumped = self.dumped
|
|
writeme = StringIO.StringIO()
|
|
self.dump(writeme)
|
|
output_lines = writeme.getvalue().splitlines()
|
|
writeme.close()
|
|
self.dumped = old_dumped
|
|
return "{}:\n {}".format(type(self).__name__, "\n ".join(output_lines))
|
|
|
|
def skip(self, new_id=None):
|
|
"""
|
|
Ensures this element will not be written to output
|
|
"""
|
|
self.dumped = 2
|
|
|
|
class _GitElementWithId(_GitElement):
  """
  The base class for Git elements that have IDs (commits and blobs)
  """

  def __init__(self):
    _GitElement.__init__(self)

    # The mark (short, portable id) for this element
    self.id = _IDS.new()

    # The previous mark for this element
    self.old_id = None

  def skip(self, new_id=None):
    """
    Stop this element from being automatically written to output.  When
    a commit is skipped, its ID must be translated to that of its
    parent, so record the rename here.
    """
    self.dumped = 2
    _IDS.record_rename(self.old_id or self.id, new_id)
|
|
|
|
class Blob(_GitElementWithId):
  """
  Our representation of git blob elements, i.e. of file contents.
  """

  def __init__(self, data, original_id = None):
    _GitElementWithId.__init__(self)

    # Denote that this is a blob
    self.type = 'blob'

    # The id the blob had in the original repository, if any
    self.original_id = original_id

    # The blob's contents
    self.data = data

  def dump(self, file_):
    """
    Write this blob element to a file.
    """
    self.dumped = 1

    contents = self.data
    file_.write('blob\nmark :%d\n' % self.id)
    file_.write('data %d\n%s\n' % (len(contents), contents))
|
|
|
|
|
|
class Reset(_GitElement):
  """
  Our representation of git reset elements: the creation (or recreation)
  of a named branch, optionally starting from a specific revision.
  """

  def __init__(self, ref, from_ref = None):
    _GitElement.__init__(self)

    # Denote that this is a reset
    self.type = 'reset'

    # The name of the branch being (re)created
    self.ref = ref

    # Some reference to the branch/commit we are resetting from
    self.from_ref = from_ref

  def dump(self, file_):
    """
    Write this reset element to a file
    """
    self.dumped = 1

    pieces = ['reset %s\n' % self.ref]
    if self.from_ref:
      pieces.append('from :%d\n' % self.from_ref)
    pieces.append('\n')
    for piece in pieces:
      file_.write(piece)
|
|
|
|
class FileChanges(_GitElement):
  """
  Our representation of file-change elements, the components within a
  Commit element describing what happened to an individual path.
  """

  def __init__(self, type_, filename, id_ = None, mode = None):
    _GitElement.__init__(self)

    # The kind of file-change (M for modify, D for delete, etc)
    self.type = type_

    # The path being changed; for renames this becomes (oldname, newname)
    self.filename = filename

    # File entry mode (non-executable, executable, or symlink); only
    # meaningful for 'M' changes
    self.mode = None

    # The id (mark) of the affected blob; only meaningful for 'M' changes
    self.blob_id = None

    if type_ == 'M':
      # Modifications require both a mode and a blob id
      if mode is None:
        raise SystemExit("file mode and idnum needed for %s" % filename)
      self.mode = mode
      self.blob_id = id_
    elif type_ == 'R':
      # Renames reuse the id_ argument slot to carry the new name
      if id_ is None:
        raise SystemExit("new name needed for rename of %s" % filename)
      self.filename = (self.filename, id_)

  def dump(self, file_):
    """
    Write this file-change element to a file
    """
    # An 'M' change whose blob was skipped produces no output at all
    if self.type == 'M' and self.blob_id is None:
      return
    self.dumped = 1

    if self.type == 'M':
      # Integer blob ids are marks and get a ':' prefix; 40-character
      # hash strings are written verbatim.
      if isinstance(self.blob_id, int):
        file_.write('M %s :%d %s\n' % (self.mode, self.blob_id, self.filename))
      else:
        file_.write('M %s %s %s\n' % (self.mode, self.blob_id, self.filename))
    elif self.type == 'D':
      file_.write('D %s\n' % self.filename)
    else:
      raise SystemExit("Unhandled filechange type: %s" % self.type)
|
|
|
|
class Commit(_GitElementWithId):
  """
  Our representation of commit elements, containing all the information
  associated with a commit: branch, author/committer identity and dates,
  message, file changes, and parent commits.
  """

  def __init__(self, branch,
               author_name, author_email, author_date,
               committer_name, committer_email, committer_date,
               message,
               file_changes,
               from_commit = None,
               merge_commits = None,
               original_id = None,
               **kwargs):
    _GitElementWithId.__init__(self)

    # Denote that this is a commit element
    self.type = 'commit'

    # Record the affected branch
    self.branch = branch

    # Record original id
    self.original_id = original_id

    # Record author name, email, and date of authoring
    self.author_name = author_name
    self.author_email = author_email
    self.author_date = author_date

    # Record committer name, email, and date the commit was made
    self.committer_name = committer_name
    self.committer_email = committer_email
    self.committer_date = committer_date

    # Record commit message
    self.message = message

    # List of file-changes associated with this commit.  Note that
    # file-changes are also represented as git elements
    self.file_changes = file_changes

    # Record the commit to initialize this branch from.  This revision will
    # be the first parent of the new commit
    self.from_commit = from_commit

    # Record additional (merge) parent commits.
    # BUGFIX: the old signature used a mutable default argument
    # (merge_commits = []), so every Commit created without an explicit
    # value shared ONE list object -- any mutation of it leaked into all
    # such commits.  Use None as the sentinel and create a fresh list.
    self.merge_commits = merge_commits if merge_commits is not None else []

    # Member below is necessary for workaround fast-import's/fast-export's
    # weird handling of merges.
    self.stream_number = kwargs.get("stream_number", 0)

  def dump(self, file_):
    """
    Write this commit element to a file.
    """
    self.dumped = 1

    # Workaround fast-import/fast-export weird handling of merges:
    # remember file changes from commits in other streams so merges can
    # replay them.
    if self.stream_number != _CURRENT_STREAM_NUMBER:
      _EXTRA_CHANGES[self.id] = [[change for change in self.file_changes]]

    merge_extra_changes = []
    for parent in self.merge_commits:
      if parent in _EXTRA_CHANGES:
        merge_extra_changes += _EXTRA_CHANGES[parent]

    for additional_changes in merge_extra_changes:
      self.file_changes += additional_changes

    if self.stream_number == _CURRENT_STREAM_NUMBER:
      parent_extra_changes = []
      if self.from_commit and self.from_commit in _EXTRA_CHANGES:
        parent_extra_changes = _EXTRA_CHANGES[self.from_commit]
      parent_extra_changes += merge_extra_changes
      _EXTRA_CHANGES[self.id] = parent_extra_changes
    # End workaround

    file_.write('commit %s\n' % self.branch)
    file_.write('mark :%d\n' % self.id)
    file_.write('author %s <%s> ' % (self.author_name, self.author_email))
    _write_date(file_, self.author_date)
    file_.write('\n')
    file_.write('committer %s <%s> ' % \
                (self.committer_name, self.committer_email))
    _write_date(file_, self.committer_date)
    file_.write('\n')
    file_.write('data %d\n%s' % (len(self.message), self.message))
    if self.from_commit:
      # Integer parents are marks (':<num>'); strings are commit hashes
      mark = ':' if isinstance(self.from_commit, int) else ''
      file_.write('from {}{}\n'.format(mark, self.from_commit))
    for ref in self.merge_commits:
      mark = ':' if isinstance(ref, int) else ''
      file_.write('merge {}{}\n'.format(mark, ref))
    for change in self.file_changes:
      change.dump(file_)
    file_.write('\n')

  def get_parents(self):
    """
    Return all parent commits: the first parent (from_commit), if any,
    followed by any merge parents.
    """
    my_parents = []
    if self.from_commit:
      my_parents.append(self.from_commit)
    my_parents += self.merge_commits
    return my_parents

  def first_parent(self):
    """
    Return the first parent commit, or None if this commit has no parents.
    """
    my_parents = self.get_parents()
    if my_parents:
      return my_parents[0]
    return None
|
|
|
|
class Tag(_GitElement):
  """
  Our representation of annotated tag elements.
  """

  def __init__(self, ref, from_ref,
               tagger_name, tagger_email, tagger_date, tag_msg,
               original_id = None):
    _GitElement.__init__(self)

    # Denote that this is a tag element
    self.type = 'tag'

    # The name of the tag
    self.ref = ref

    # The entity being tagged (this should be a commit)
    self.from_ref = from_ref

    # The id this tag had in the original repository, if any
    self.original_id = original_id

    # Tagger identity and timestamp
    self.tagger_name = tagger_name
    self.tagger_email = tagger_email
    self.tagger_date = tagger_date

    # The tag message
    self.tag_message = tag_msg

  def dump(self, file_):
    """
    Write this tag element to a file
    """
    self.dumped = 1

    file_.write('tag %s\n' % self.ref)
    # Integer from_refs are marks (':<num>'); strings are commit hashes
    prefix = ':' if isinstance(self.from_ref, int) else ''
    file_.write('from {}{}\n'.format(prefix, self.from_ref))
    file_.write('tagger %s <%s> ' % (self.tagger_name, self.tagger_email))
    _write_date(file_, self.tagger_date)
    file_.write('\n')
    file_.write('data %d\n%s' % (len(self.tag_message), self.tag_message))
    file_.write('\n')
|
|
|
|
class Progress(_GitElement):
  """
  Our representation of progress elements: a message that fast-import
  prints when it processes the progress output.
  """

  def __init__(self, message):
    _GitElement.__init__(self)

    # Denote that this is a progress element
    self.type = 'progress'

    # The progress message to emit
    self.message = message

  def dump(self, file_):
    """
    Write this progress element to a file
    """
    self.dumped = 1
    # Note: unlike most elements, no trailing blank line is emitted
    file_.write('progress %s\n' % self.message)
|
|
|
|
class Checkpoint(_GitElement):
  """
  Our representation of checkpoint elements: events which force
  fast-import to close the current packfile, start a new one, and save
  out all current branch refs, tags and marks.
  """

  def __init__(self):
    _GitElement.__init__(self)

    # Denote that this is a checkpoint element
    self.type = 'checkpoint'

  def dump(self, file_):
    """
    Write this checkpoint element to a file
    """
    self.dumped = 1
    file_.write('checkpoint\n\n')
|
|
|
|
class LiteralCommand(_GitElement):
  """
  Our representation of a literal command: a single raw line passed
  through to the output without any special processing.
  """

  def __init__(self, line):
    _GitElement.__init__(self)

    # Denote that this is a literal element
    self.type = 'literal'

    # The raw line to emit, exactly as given
    self.line = line

  def dump(self, file_):
    """
    Write this literal command to a file
    """
    self.dumped = 1
    file_.write(self.line)
|
|
|
|
class FastExportFilter(object):
|
|
"""
|
|
A class for parsing and handling the output from fast-export. This
|
|
class allows the user to register callbacks when various types of
|
|
data are encountered in the fast-export output. The basic idea is that,
|
|
FastExportFilter takes fast-export output, creates the various objects
|
|
as it encounters them, the user gets to use/modify these objects via
|
|
callbacks, and finally FastExportFilter outputs the modified objects
|
|
in fast-import format (presumably so they can be used to create a new
|
|
repo).
|
|
"""
|
|
|
|
def __init__(self,
             tag_callback = None, commit_callback = None,
             blob_callback = None, progress_callback = None,
             reset_callback = None, checkpoint_callback = None,
             everything_callback = None):
  """
  Store the user callbacks and initialize all parsing state.
  """
  # Callback functions for the various git elements
  self._tag_callback        = tag_callback
  self._blob_callback       = blob_callback
  self._reset_callback      = reset_callback
  self._commit_callback     = commit_callback
  self._progress_callback   = progress_callback
  self._checkpoint_callback = checkpoint_callback
  self._everything_callback = everything_callback

  # All refs we've seen, each mapped to the mark we need to set it to if
  # the last (or even only) commit on that branch was pruned
  self._seen_refs = {}

  # Ancestry information about the commits parsed so far; commits and
  # ancestors are identified by their id (their 'mark' in fast-export or
  # fast-import speak).
  self._graph = AncestryGraph()

  # (oldhash, newhash) pairs for commits that used to be merge commits
  # but, due to filtering, no longer are; their commit messages (e.g.
  # "Merge branch next into master") are probably suboptimal now.
  self._commits_no_longer_merges = []

  # original_id -> new_id for filtered commits; new_id may be None when
  # the original commit became empty and was pruned or otherwise
  # dropped.  Recorded both for diagnostics and so commit messages can
  # be rewritten.
  self._commit_renames = {}

  # commit_hash[0:7] -> set(full hashes with that prefix).  Commit
  # messages commonly reference commits by abbreviated hashes as short
  # as 7 characters; this map lets us translate such short hashes.
  self._commit_short_old_hashes = collections.defaultdict(set)

  # Commit hashes referenced in commit messages that mapped to a valid
  # commit removed entirely by filtering; those references stay
  # untranslated since there was nothing to map them to.
  self._commits_referenced_but_removed = set()

  # Handle to the input source for the fast-export data
  self._input = None

  # Handle to the output file for the output we generate (we call dump
  # on many of the git elements we create).
  self._output = None

  # Contents of the current line of input being parsed
  self._currentline = ''

  # Translation offset for ids, useful when reading the output of a
  # second or third (or etc.) git fast-export stream
  self._id_offset = 0

  # Progress handling (number of commits parsed, etc.)
  self._num_commits = 0
  self._quiet = False
  self._last_progress_update = 0 # seconds since Epoch; arbitrary old date

  # Whether we've run our post-processing extra commands
  self._finalize_handled = False
|
|
|
|
def _advance_currentline(self):
  """
  Grab the next line of input.

  Reads one line from self._input and stores it (including its trailing
  newline) in self._currentline; as with file objects, an empty string
  signals end of input.
  """
  self._currentline = self._input.readline()
|
|
|
|
def _parse_optional_mark(self):
|
|
"""
|
|
If the current line contains a mark, parse it and advance to the
|
|
next line; return None otherwise
|
|
"""
|
|
mark = None
|
|
matches = re.match('mark :(\d+)\n$', self._currentline)
|
|
if matches:
|
|
mark = int(matches.group(1))+self._id_offset
|
|
self._advance_currentline()
|
|
return mark
|
|
|
|
def _parse_optional_parent_ref(self, refname):
  """
  If the current line references a parent commit under refname ('from'
  or 'merge'), parse it and advance the current line; otherwise return
  None.  Mark-style references are translated into our mark namespace;
  40-character commit hashes are returned verbatim.
  """
  mark_match = re.match('%s :(\d+)\n' % refname, self._currentline)
  if mark_match:
    # Translate the parent commit mark to what it needs to be in our
    # mark namespace
    baseref = _IDS.translate( int(mark_match.group(1))+self._id_offset )
    self._advance_currentline()
    return baseref

  hash_match = re.match('%s ([0-9a-f]{40})\n' % refname, self._currentline)
  if hash_match:
    baseref = hash_match.group(1)
    self._advance_currentline()
    return baseref

  return None
|
|
|
|
def _parse_optional_filechange(self):
  """
  If the current line contains a file-change object, then parse it
  and advance the current line; otherwise return None.  We only care
  about file changes of type 'M' and 'D' (these are the only types
  of file-changes that fast-export will provide).

  Returns a FileChanges object, the string 'skipped' (for an 'M' change
  whose blob's mark translated to None), or None when the current line
  is not a file-change at all.
  """
  filechange = None
  if self._currentline.startswith('M '):
    # Modify: 'M <mode> <mark-or-hash> <path>'
    (mode, idnum, path) = \
      re.match('M (\d+) (?::?([0-9a-f]{40}|\d+)) (.*)\n$',
               self._currentline).groups()
    # We translate the idnum to our id system; 40-character strings are
    # raw commit/blob hashes and are left untranslated
    if len(idnum) != 40:
      idnum = _IDS.translate( int(idnum)+self._id_offset )
    if idnum is not None:
      # Quoted paths need unquoting before use
      if path.startswith('"'):
        path = unquote(path)
      filechange = FileChanges('M', path, idnum, mode)
    else:
      # The blob this change refers to was skipped
      filechange = 'skipped'
    self._advance_currentline()
  elif self._currentline.startswith('D '):
    # Delete: 'D <path>'
    path = self._currentline[2:-1]
    if path.startswith('"'):
      path = unquote(path)
    filechange = FileChanges('D', path)
    self._advance_currentline()
  elif self._currentline.startswith('R '):
    # Rename: 'R <source> <dest>'; either path may be quoted
    rest = self._currentline[2:-1]
    if rest.startswith('"'):
      # A quoted source must be matched as a full quoted token, since it
      # may contain embedded (escaped) spaces and quotes
      m = re.match(r'"(?:[^"\\]|\\.)*"', rest)
      if not m:
        raise SystemExit("Couldn't parse rename source")
      orig = unquote(m.group(0))
      new = rest[m.end()+1:]
    else:
      orig, new = rest.split(' ', 1)
    if new.startswith('"'):
      new = unquote(new)
    filechange = FileChanges('R', orig, new)
    self._advance_currentline()
  return filechange
|
|
|
|
def _parse_original_id(self):
|
|
original_id = self._currentline[len('original-oid '):].rstrip()
|
|
self._advance_currentline()
|
|
return original_id
|
|
|
|
def _parse_ref_line(self, refname):
|
|
"""
|
|
Parses string data (often a branch name) from current-line. The name of
|
|
the string data must match the refname arg. The program will crash if
|
|
current-line does not match, so current-line will always be advanced if
|
|
this method returns.
|
|
"""
|
|
matches = re.match('%s (.*)\n$' % refname, self._currentline)
|
|
if not matches:
|
|
raise SystemExit("Malformed %s line: '%s'" %
|
|
(refname, self._currentline))
|
|
ref = matches.group(1)
|
|
self._advance_currentline()
|
|
return ref
|
|
|
|
def _parse_user(self, usertype):
  """
  Parse (name, email, datestamp) from a user line of the given usertype
  ('author', 'committer', 'tagger'); advances the current line.  The
  datestamp is a timezone-aware datetime object.
  """
  user_regex = '%s (.*?) <(.*?)> (.*)\n$' % usertype
  (name, email, when) = re.match(user_regex, self._currentline).groups()

  # 'when' is '<unix-timestamp> <tz-offset>'; turn it into a datetime
  # carrying the corresponding timezone info
  (unix_timestamp, tz_offset) = when.split()
  datestamp = datetime.fromtimestamp(int(unix_timestamp),
                                     FixedTimeZone(tz_offset))

  self._advance_currentline()
  return (name, email, datestamp)
|
|
|
|
def _parse_data(self):
|
|
"""
|
|
Reads data from _input. Current-line will be advanced until it is beyond
|
|
the data.
|
|
"""
|
|
size = int(re.match('data (\d+)\n$', self._currentline).group(1))
|
|
data = self._input.read(size)
|
|
self._advance_currentline()
|
|
if self._currentline == '\n':
|
|
self._advance_currentline()
|
|
return data
|
|
|
|
def _parse_blob(self):
  """
  Parse input data into a Blob object, hand it to the registered
  callbacks, and advance the current line beyond the blob's data.  The
  Blob is then dumped to _output, unless a callback skipped it.
  """
  # Skip past the 'blob' line, then grab the mark (if any)
  self._advance_currentline()
  id_ = self._parse_optional_mark()

  original_id = None
  if self._currentline.startswith('original-oid'):
    original_id = self._parse_original_id()

  data = self._parse_data()
  if self._currentline == '\n':
    self._advance_currentline()

  # Create the blob
  blob = Blob(data, original_id)

  # If the fast-export text had a mark for this blob, make sure that
  # mark translates to the blob's true id
  if id_:
    blob.old_id = id_
    _IDS.record_rename(id_, blob.id)

  # Let the user callbacks use/modify the blob
  if self._blob_callback:
    self._blob_callback(blob)
  if self._everything_callback:
    self._everything_callback('blob', blob)

  # Now print the resulting blob, unless a callback already handled it
  if not blob.dumped:
    blob.dump(self._output)
|
|
|
|
def _parse_reset(self):
  """
  Parse input data into a Reset object, hand it to the registered
  callbacks, and advance the current line beyond the reset data.  The
  Reset is then dumped to _output, unless a callback skipped it.
  """
  # Parse the ref being reset plus an optional 'from' parent
  ref = self._parse_ref_line('reset')
  self._seen_refs[ref] = None
  from_ref = self._parse_optional_parent_ref('from')
  if self._currentline == '\n':
    self._advance_currentline()

  # Create the reset
  reset = Reset(ref, from_ref)

  # Let the user callbacks use/modify the reset
  if self._reset_callback:
    self._reset_callback(reset)
  if self._everything_callback:
    self._everything_callback('reset', reset)

  # Now print the resulting reset, unless a callback already handled it
  if not reset.dumped:
    reset.dump(self._output)
|
|
|
|
def _translate_commit_hash(self, matchobj):
|
|
old_hash = matchobj.group(1)
|
|
orig_len = len(old_hash)
|
|
if old_hash not in self._commit_renames:
|
|
if old_hash[0:7] not in self._commit_short_old_hashes:
|
|
return old_hash
|
|
possibilities = self._commit_short_old_hashes[old_hash[0:7]]
|
|
matches = [x for x in possibilities
|
|
if x[0:orig_len] == old_hash]
|
|
if len(matches) != 1:
|
|
return old_hash
|
|
old_hash = matches[0]
|
|
|
|
new_hash = self._commit_renames[old_hash]
|
|
if new_hash is None:
|
|
self._commits_referenced_but_removed.add(old_hash)
|
|
return old_hash[0:orig_len]
|
|
else:
|
|
return new_hash[0:orig_len]
|
|
|
|
def num_commits_parsed(self):
  """
  Return the number of commit objects parsed from the input so far.
  """
  return self._num_commits
|
|
|
|
def _show_progress(self, force=False):
|
|
if not self._quiet:
|
|
now = time.time()
|
|
if force or now - self._last_progress_update > .1:
|
|
self._last_progress_update = now
|
|
print("\rParsed {} commits".format(self._num_commits), end='')
|
|
if force:
|
|
print("\n")
|
|
|
|
  def _parse_commit(self, fast_import_pipes):
    """
    Parse input data into a Commit object. Once the Commit has been created,
    it will be handed off to the appropriate callbacks. Current-line will
    be advanced until it is beyond the commit data. The Commit will be dumped
    to _output once everything else is done (unless it has been skipped by
    the callback OR the callback has removed all file-changes from the commit).

    fast_import_pipes, when not None, is a (stdin, stdout) pair connected
    to a running `git fast-import` process; it is used to issue `ls` and
    `get-mark` queries so we can detect now-empty merges and record the
    old-hash -> new-hash mapping.
    """
    # Parse the Commit. This may look involved, but it's pretty simple; it only
    # looks bad because a commit object contains many pieces of data.
    branch = self._parse_ref_line('commit')
    self._seen_refs[branch] = None
    id_ = self._parse_optional_mark()

    original_id = None
    if self._currentline.startswith('original-oid'):
      original_id = self._parse_original_id();

    # Author is optional in the stream; fall back to committer below
    author_name = None
    if self._currentline.startswith('author'):
      (author_name, author_email, author_date) = self._parse_user('author')

    (committer_name, committer_email, committer_date) = \
      self._parse_user('committer')

    if not author_name:
      (author_name, author_email, author_date) = \
        (committer_name, committer_email, committer_date)

    # Rewrite any old commit hashes mentioned in the commit message to
    # their post-filtering equivalents (7-to-40 hex chars at word edges)
    commit_msg = self._parse_data()
    commit_msg = re.sub(r'(\b[0-9a-f]{7,40}\b)',
                        self._translate_commit_hash,
                        commit_msg)

    parents = []
    parents.append(self._parse_optional_parent_ref('from'))
    merge_ref = self._parse_optional_parent_ref('merge')
    while merge_ref:
      parents.append(merge_ref)
      merge_ref = self._parse_optional_parent_ref('merge')

    was_merge = len(parents) > 1
    # Remove redundant parents (if both sides of history are empty commits,
    # the most recent ancestor on both sides may be the same commit).
    # OrderedDict keeps the first occurrence's position for each parent.
    parents = collections.OrderedDict.fromkeys(parents).keys()

    # Flatten unnecessary merges. (If one side of history is entirely
    # empty commits that were pruned, we may end up attempting to
    # merge a commit with its ancestor. Remove parents that are an
    # ancestor of another parent.)
    num_original_parents = len(parents)
    check_merge_now_empty = False
    if num_original_parents > 1:
      to_remove = []
      for cur in xrange(num_original_parents):
        for other in xrange(num_original_parents):
          if cur != other and self._graph.is_ancestor(parents[cur],
                                                      parents[other]):
            to_remove.append(cur)
      # Pop from the end so earlier indices remain valid
      for x in reversed(to_remove):
        parents.pop(x)
      if len(parents) == 1:
        check_merge_now_empty = True

    # Record our new parents after above pruning of parents representing
    # pruned empty histories
    from_commit = parents[0]
    merge_commits = parents[1:]

    # Collect the file changes; a 'skipped' marker means a change was
    # filtered out by the filechange parsing and should not be kept
    file_changes = []
    file_change = self._parse_optional_filechange()
    had_file_changes = file_change is not None or was_merge
    while file_change:
      if not (type(file_change) == str and file_change == 'skipped'):
        file_changes.append(file_change)
      file_change = self._parse_optional_filechange()
    if self._currentline == '\n':
      self._advance_currentline()

    # If we had a merge commit and the first parent history back to the
    # merge base was entirely composed of commits made empty by our
    # filtering, it is likely that this merge commit is empty and can be
    # pruned too. Check by comparing the contents of this merge to its
    # remaining parent.
    #
    # NOTES on why/how this works:
    # 1. fast-export always gives file changes in a merge commit relative
    # to the first parent.
    # 2. The only way this 'if' is active is when the first parent was
    # an ancestor of what is now the only remaining parent
    # 3. The two above imply that the file changes we're looking at are
    # just for the line of history for the remaining parent, and show
    # all changes needed to make the original first parent (whose tree
    # matched an ancestor of the remaining parent) match the merge's tree.
    # 4. If the versions of all specified files in the remaining parent
    # match the file change versions, then this "merge" commit is
    # actually going to be an empty non-merge commit and we should prune
    # it.
    if check_merge_now_empty and fast_import_pipes:
      unnecessary_filechanges = set()
      fi_input, fi_output = fast_import_pipes
      # Optimization note: we could have two loops over file_changes, the
      # first doing all the fi_input.write() calls, and the second doing the
      # rest. But I'm worried about fast-import blocking on fi_output
      # buffers filling up so I instead read from it as I go.
      for change in file_changes:
        fi_input.write("ls :{} {}\n".format(from_commit, change.filename))
        parent_version = fi_output.readline().split()
        if change.type == 'D':
          # A deletion is redundant if the parent lacks the file anyway
          if parent_version == ['missing', change.filename]:
            unnecessary_filechanges.add(change)
        else:
          # Resolve a mark to its sha1 so we can compare with `ls` output
          blob_sha = change.blob_id
          if isinstance(change.blob_id, int):
            fi_input.write("get-mark :{}\n".format(change.blob_id))
            blob_sha = fi_output.readline().rstrip()
          if parent_version == [change.mode, 'blob', blob_sha, change.filename]:
            unnecessary_filechanges.add(change)
      file_changes = [change for change in file_changes
                      if change not in unnecessary_filechanges]

    # Okay, now we can finally create the Commit object
    commit = Commit(branch,
                    author_name, author_email, author_date,
                    committer_name, committer_email, committer_date,
                    commit_msg,
                    file_changes,
                    from_commit,
                    merge_commits,
                    original_id,
                    stream_number = _CURRENT_STREAM_NUMBER)

    # If fast-export text had a mark for this commit, need to make sure this
    # mark translates to the commit's true id.
    if id_:
      commit.old_id = id_
      _IDS.record_rename(id_, commit.id)

    # Record ancestry graph
    self._graph.add_commit_and_parents(commit.id, commit.get_parents())

    # Call any user callback to allow them to modify the commit
    if self._commit_callback:
      self._commit_callback(commit)
    if self._everything_callback:
      self._everything_callback('commit', commit)

    # Now print the resulting commit, unless all its changes were dropped and
    # it was a non-merge commit
    merge_commit = len(parents) > 1
    if not commit.dumped:
      if (commit.file_changes or merge_commit or
          (not had_file_changes and len(parents) >= 1)):
        commit.dump(self._output)
        new_id = None
        # Record the mapping of old commit hash to new one
        if commit.original_id and fast_import_pipes:
          fi_input, fi_output = fast_import_pipes
          fi_input.write("get-mark :{}\n".format(commit.id))
          orig_id = commit.original_id
          new_id = fi_output.readline().rstrip()
          self._commit_renames[orig_id] = new_id
          self._commit_short_old_hashes[orig_id[0:7]].add(orig_id)
          # Now, record if this was a merge commit that turned into a non-merge
          # commit.
          if num_original_parents > 1 and not merge_commit:
            self._commits_no_longer_merges.append((orig_id, new_id))
      else:
        # We skip empty commits, but want to keep track to make sure we don't
        # lose any refs this way.
        self._seen_refs[branch] = commit.first_parent()
        commit.skip(commit.first_parent())
        self._commit_renames[commit.original_id] = None
    self._num_commits += 1
    self._show_progress()
|
def _parse_tag(self):
|
|
"""
|
|
Parse input data into a Tag object. Once the Tag has been created,
|
|
it will be handed off to the appropriate callbacks. Current-line will
|
|
be advanced until it is beyond the tag data. The Tag will be dumped
|
|
to _output once everything else is done (unless it has been skipped by
|
|
the callback).
|
|
"""
|
|
# Parse the Tag
|
|
tag = self._parse_ref_line('tag')
|
|
from_ref = self._parse_optional_parent_ref('from')
|
|
if from_ref is None:
|
|
raise SystemExit("Expected 'from' line while parsing tag %s" % tag)
|
|
|
|
original_id = None
|
|
if self._currentline.startswith('original-oid'):
|
|
original_id = self._parse_original_id();
|
|
|
|
(tagger_name, tagger_email, tagger_date) = self._parse_user('tagger')
|
|
tag_msg = self._parse_data()
|
|
if self._currentline == '\n':
|
|
self._advance_currentline()
|
|
|
|
# Create the tag
|
|
tag = Tag(tag, from_ref,
|
|
tagger_name, tagger_email, tagger_date, tag_msg,
|
|
original_id)
|
|
|
|
# Call any user callback to allow them to modify the tag
|
|
if self._tag_callback:
|
|
self._tag_callback(tag)
|
|
if self._everything_callback:
|
|
self._everything_callback('tag', tag)
|
|
|
|
# Now print the resulting reset
|
|
if not tag.dumped:
|
|
tag.dump(self._output)
|
|
|
|
def _parse_progress(self):
|
|
"""
|
|
Parse input data into a Progress object. Once the Progress has
|
|
been created, it will be handed off to the appropriate
|
|
callbacks. Current-line will be advanced until it is beyond the
|
|
progress data. The Progress will be dumped to _output once
|
|
everything else is done (unless it has been skipped by the callback).
|
|
"""
|
|
# Parse the Progress
|
|
message = self._parse_ref_line('progress')
|
|
if self._currentline == '\n':
|
|
self._advance_currentline()
|
|
|
|
# Create the progress message
|
|
progress = Progress(message)
|
|
|
|
# Call any user callback to allow them to modify the progress messsage
|
|
if self._progress_callback:
|
|
self._progress_callback(progress)
|
|
if self._everything_callback:
|
|
self._everything_callback('progress', progress)
|
|
|
|
# Now print the resulting progress message
|
|
if not progress.dumped:
|
|
progress.dump(self._output)
|
|
|
|
def _parse_checkpoint(self):
|
|
"""
|
|
Parse input data into a Checkpoint object. Once the Checkpoint has
|
|
been created, it will be handed off to the appropriate
|
|
callbacks. Current-line will be advanced until it is beyond the
|
|
checkpoint data. The Checkpoint will be dumped to _output once
|
|
everything else is done (unless it has been skipped by the callback).
|
|
"""
|
|
# Parse the Checkpoint
|
|
self._advance_currentline()
|
|
if self._currentline == '\n':
|
|
self._advance_currentline()
|
|
|
|
# Create the checkpoint
|
|
checkpoint = Checkpoint()
|
|
|
|
# Call any user callback to allow them to drop the checkpoint
|
|
if self._checkpoint_callback:
|
|
self._checkpoint_callback(checkpoint)
|
|
if self._everything_callback:
|
|
self._everything_callback('checkpoint', checkpoint)
|
|
|
|
# Now print the resulting checkpoint
|
|
if not checkpoint.dumped:
|
|
checkpoint.dump(self._output)
|
|
|
|
def _parse_literal_command(self):
|
|
"""
|
|
Parse literal command. Just dump the line as is.
|
|
"""
|
|
# Create the literal command object
|
|
command = LiteralCommand(self._currentline)
|
|
self._advance_currentline()
|
|
|
|
# Now print the resulting checkpoint
|
|
if not command.dumped:
|
|
command.dump(self._output)
|
|
|
|
def _handle_final_commands(self):
|
|
self._finalize_handled = True
|
|
for ref, value in self._seen_refs.iteritems():
|
|
if value is not None:
|
|
# Create a reset
|
|
reset = Reset(ref, value)
|
|
|
|
# Call any user callback to allow them to modify the reset
|
|
if self._reset_callback:
|
|
self._reset_callback(reset)
|
|
if self._everything_callback:
|
|
self._everything_callback('reset', reset)
|
|
|
|
# Now print the resulting reset
|
|
reset.dump(self._output)
|
|
|
|
  def record_metadata(self, metadata_dir, orig_refs, refs_nuked):
    """
    Write metadata files describing the rewrite into metadata_dir:

      commit-map:         old commit hash -> new commit hash (an
                          all-zeros hash means the commit was pruned)
      ref-map:            old hash, new hash, and name for each ref
      suboptimal-issues:  merges that became regular commits, and pruned
                          commits still referenced from commit messages

    orig_refs is a dict of refname -> hash from before the rewrite;
    refs_nuked contains the refnames that were deleted.
    """
    deleted_hash = '0'*40
    with open(os.path.join(metadata_dir, 'commit-map'), 'w') as f:
      f.write("old new\n")
      for (old,new) in self._commit_renames.iteritems():
        f.write('{} {}\n'.format(old, new if new != None else deleted_hash))

    # Lazily-started `git cat-file --batch-check` process, reused for all
    # annotated tags encountered in the loop below
    batch_check_process = None
    with open(os.path.join(metadata_dir, 'ref-map'), 'w') as f:
      for refname, old_hash in orig_refs.iteritems():
        if refname in refs_nuked:
          new_hash = deleted_hash
        elif old_hash in self._commit_renames:
          new_hash = self._commit_renames[old_hash]
          # None means the commit was pruned
          new_hash = new_hash if new_hash != None else deleted_hash
        else: # Must be an annotated tag
          if not batch_check_process:
            cmd = 'git cat-file --batch-check'.split()
            batch_check_process = subprocess.Popen(cmd,
                                                   stdin=subprocess.PIPE,
                                                   stdout=subprocess.PIPE)
          batch_check_process.stdin.write(refname+"\n")
          line = batch_check_process.stdout.readline()
          # Expect '<sha1> <type> <size>'; the type must be 'tag'
          m = re.match('^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$', line)
          if not m or m.group(2) != 'tag':
            raise SystemExit("Failed to find new id for {} (old id was {})"
                             .format(refname, old_hash))
          new_hash = m.group(1)
        f.write('{} {} {}\n'.format(old_hash, new_hash, refname))
      if batch_check_process:
        batch_check_process.stdin.close()
        batch_check_process.wait()

    with open(os.path.join(metadata_dir, 'suboptimal-issues'), 'w') as f:
      issues_found = False
      if self._commits_no_longer_merges:
        issues_found = True

        # '[1:]' drops the leading newline of the triple-quoted string
        f.write(textwrap.dedent('''
          The following commits used to be merge commits but due to filtering
          are now regular commits; they likely have suboptimal commit messages
          (e.g. "Merge branch next into master"). Original commit hash on the
          left, commit hash after filtering/rewriting on the right:
          '''[1:]))
        for oldhash, newhash in self._commits_no_longer_merges:
          f.write(' {} {}\n'.format(oldhash, newhash))
        f.write('\n')

      if self._commits_referenced_but_removed:
        issues_found = True
        f.write(textwrap.dedent('''
          The following commits were filtered out, but referenced in another
          commit message. The reference to the now-nonexistent commit hash
          (or a substring thereof) was left as-is in any commit messages:
          '''[1:]))
        for bad_commit_reference in self._commits_referenced_but_removed:
          f.write(' {}\n'.format(bad_commit_reference))
        f.write('\n')

      if not issues_found:
        f.write("No filtering problems encountered.")
|
  def get_seen_refs(self):
    """
    Return a list of all refs encountered while parsing the input.
    """
    return self._seen_refs.keys()
|
def run(self, *args, **kwargs):
|
|
"""
|
|
This method performs the filter. The method optionally takes two arguments.
|
|
The first represents the source repository (either a file object
|
|
containing git-fast-export output, or a string containing the path to the
|
|
source repository where we can run git-fast-export), and the second
|
|
argument represents the target repository (again either a file object into
|
|
which we write git-fast-import input, or a string containing the path to
|
|
the source repository where we can run git-fast-import).
|
|
"""
|
|
# Sanity check arguments
|
|
if len(args) != 0 and len(args) != 2:
|
|
raise SystemExit("run() must be called with 0 or 2 arguments")
|
|
if type(args[0]) != str and not (
|
|
hasattr(args[0], 'read') and hasattr(args[0], 'readline')):
|
|
raise SystemExit("arguments to run() must be filenames or files")
|
|
if type(args[1]) != str and not (
|
|
hasattr(args[1], 'write') and hasattr(args[1], 'close')):
|
|
raise SystemExit("arguments to run() must be filenames or files")
|
|
|
|
# Set input. If no args provided, use stdin.
|
|
self._input = sys.stdin
|
|
if len(args) > 0:
|
|
if type(args[0]) == str:
|
|
# If repo-name provided, set up fast_export process pipe as input
|
|
self._input = fast_export_output(args[0]).stdout
|
|
else:
|
|
# If file-obj provided, just use that
|
|
self._input = args[0]
|
|
|
|
# Set output. If no args provided, use stdout.
|
|
self._output = sys.stdout
|
|
output_pipe = None
|
|
need_wait = False
|
|
if len(args) > 1:
|
|
if type(args[1]) == str:
|
|
# If repo-name provided, output to fast_import process pipe
|
|
output_pipe = fast_import_input(args[1])
|
|
self._output = output_pipe.stdin
|
|
need_wait = True
|
|
else:
|
|
# If file-obj provided, just use that
|
|
self._output = args[1]
|
|
|
|
# Show progress by default
|
|
self._quiet = kwargs.get('quiet', False)
|
|
|
|
# Setup some vars
|
|
global _CURRENT_STREAM_NUMBER
|
|
|
|
_CURRENT_STREAM_NUMBER += 1
|
|
if _CURRENT_STREAM_NUMBER > 1:
|
|
self._id_offset = _IDS._next_id-1
|
|
|
|
# Run over the input and do the filtering
|
|
self._advance_currentline()
|
|
while self._currentline:
|
|
if self._currentline.startswith('blob'):
|
|
self._parse_blob()
|
|
elif self._currentline.startswith('reset'):
|
|
self._parse_reset()
|
|
elif self._currentline.startswith('commit'):
|
|
self._parse_commit(kwargs.get('fast_import_pipes', None))
|
|
elif self._currentline.startswith('tag'):
|
|
self._parse_tag()
|
|
elif self._currentline.startswith('progress'):
|
|
self._parse_progress()
|
|
elif self._currentline.startswith('checkpoint'):
|
|
self._parse_checkpoint()
|
|
elif self._currentline.startswith('feature'):
|
|
self._parse_literal_command()
|
|
elif self._currentline.startswith('option'):
|
|
self._parse_literal_command()
|
|
elif self._currentline.startswith('done'):
|
|
self._handle_final_commands()
|
|
self._parse_literal_command()
|
|
elif self._currentline.startswith('#'):
|
|
self._parse_literal_command()
|
|
elif self._currentline.startswith('get-mark') or \
|
|
self._currentline.startswith('cat-blob') or \
|
|
self._currentline.startswith('ls'):
|
|
raise SystemExit("Unsupported command: '%s'" % self._currentline)
|
|
else:
|
|
raise SystemExit("Could not parse line: '%s'" % self._currentline)
|
|
|
|
self._show_progress(force = True)
|
|
if not self._finalize_handled:
|
|
self._handle_final_commands()
|
|
|
|
# If we created fast_import process, close pipe and wait for it to finish
|
|
if need_wait:
|
|
self._output.close()
|
|
output_pipe.wait()
|
|
|
|
def fast_export_output(source_repo, extra_args = None):
  """
  Given a source-repo location, setup a Popen process that runs fast-export
  on that repo.  The Popen object is returned (we do NOT wait for it to
  finish).

  extra_args, if provided, is a list of additional command-line flags for
  git fast-export; when empty or None it defaults to exporting all refs
  (--all).  The caller's list is not modified.
  """
  if not extra_args:
    extra_args = ["--all"]

  # If the client specified an import-marks file, we find the biggest mark
  # within that file and make sure that _IDS generates new marks that are
  # at least higher than that.
  for arg in extra_args:
    if arg.startswith("--import-marks"):
      filename = arg[len("--import-marks="):]
      # Use a context manager so the marks file is closed promptly
      # (previously the file handle was leaked).
      with open(filename, 'r') as marks_file:
        lines = marks_file.read().strip().splitlines()
      if lines:
        # Each line is ':<mark> <sha1>'; strip the leading colon
        biggest_mark = max([int(line.split()[0][1:]) for line in lines])
        _IDS._avoid_ids_below(biggest_mark)

  # Create and return the git process.  Build a new list rather than
  # appending to extra_args, so the caller's list is left untouched.
  extra_args = extra_args + ['--use-done-feature']
  return subprocess.Popen(["git", "fast-export", "--topo-order"] + extra_args,
                          stdout=subprocess.PIPE,
                          cwd=source_repo)
|
def fast_import_input(target_repo, extra_args = None):
  """
  Set up and return a Popen handle running `git fast-import` inside
  target_repo; the caller is responsible for feeding its stdin and
  waiting for it to complete.  If the target directory does not yet
  exist, it is created and initialized as a shared git repository.
  """
  extra_args = [] if extra_args is None else extra_args

  # Initialize the target repository on demand
  if not os.path.isdir(target_repo):
    os.makedirs(target_repo)
    if subprocess.call(["git", "init", "--shared"], cwd=target_repo) != 0:
      raise SystemExit("git init in %s failed!" % target_repo)

  # Launch fast-import; --quiet keeps its statistics off our output
  cmd = ["git", "fast-import", "--quiet"] + extra_args
  return subprocess.Popen(cmd, stdin=subprocess.PIPE, cwd=target_repo)
|
def get_commit_count(repo, *args):
  """
  Return the number of commits that have been made on repo.

  args may be either rev-list arguments given positionally or a single
  list of such arguments; it defaults to counting commits on all refs
  (--all).  Raises SystemExit if repo does not appear to be a valid git
  repository.
  """
  if not args:
    args = ['--all']
  if len(args) == 1 and isinstance(args[0], list):
    args = args[0]
  p1 = subprocess.Popen(["git", "rev-list"] + args,
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                        cwd=repo)
  # Count the lines ourselves instead of piping to `wc -l`; this drops an
  # external dependency and, via communicate(), reliably waits for
  # rev-list to finish.  (The original checked p1.poll(), which could
  # return None before the process was reaped and wrongly abort on a
  # perfectly valid repository.)
  stdout, _ = p1.communicate()
  count = len(stdout.splitlines())
  if p1.returncode != 0:
    raise SystemExit("%s does not appear to be a valid git repository" % repo)
  return count
|
def get_total_objects(repo):
  """
  Return the total number of objects in repo, both loose and packed.
  """
  proc = subprocess.Popen(["git", "count-objects", "-v"],
                          stdout=subprocess.PIPE, cwd=repo)
  lines = proc.stdout.read().splitlines()
  # Line 0 is 'count: N' (loose objects); line 2 is 'in-pack: M'
  loose = int(lines[0].split()[1])
  packed = int(lines[2].split()[1])
  return loose + packed
|
def record_id_rename(old_id, new_id):
  """
  Register a translation from mark old_id to mark new_id, propagating
  the rename through any existing chains of renames (transitivity).
  """
  transitive = True
  _IDS.record_rename(old_id, new_id, transitive)
|
# Internal globals
_IDS = _IDs()               # mark/id translation table shared by all streams
_EXTRA_CHANGES = {} # idnum -> list of list of FileChanges
_CURRENT_STREAM_NUMBER = 0  # how many fast-export streams have been processed
|
######################################################################
|
|
|
|
class AppendFilter(argparse.Action):
  """
  argparse action that accumulates (mod_type, match_type, value) triples
  on the destination attribute, deriving the two types from which
  --path-* flag was used (e.g. --path-glob -> ('filter', 'glob', value),
  --path-rename -> ('rename', 'prefix', value)).
  """
  def __call__(self, parser, namespace, values, option_string=None):
    # '--path' itself leaves an empty suffix, which means an exact match
    kind = option_string[len('--path-'):] or 'match'
    if kind == 'rename':
      entry = ('rename', 'prefix', values)
    elif kind.startswith('rename-'):
      entry = ('rename', kind[len('rename-'):], values)
    else:
      entry = ('filter', kind, values)
    existing = getattr(namespace, self.dest, []) or []
    existing.append(entry)
    setattr(namespace, self.dest, existing)
|
def get_args():
  """
  Parse and post-process command line arguments for filter-repo.

  Returns the argparse namespace; raises SystemExit on usage errors or
  on incompatible option combinations (e.g. --analyze with --path*).
  """
  parser = argparse.ArgumentParser(description='Rewrite repository history')
  # FIXME: Need to special case all --* args that rev-list takes, or call
  # git rev-parse ...
  parser.add_argument('--analyze', action='store_true',
                      help='''Analyze repository history and create a report
                      that may be useful in determining what to
                      filter in a subsequent run.''')
  parser.add_argument('--force', '-f', action='store_true',
                      help='''Rewrite history even if the current repo does not
                      look like a fresh clone.''')

  path_group = parser.add_argument_group(title='Filtering based on paths')
  path_group.add_argument('--invert-paths', action='store_false',
                          dest='inclusive',
                          help='''Invert the selection of files from the
                          specified --path-{match,glob,regex} options
                          below, i.e. only select files matching none
                          of those options.''')

  path_group.add_argument('--path-match', '--path', metavar='DIR_OR_FILE',
                          action=AppendFilter, dest='path_changes',
                          help='''Exact paths (files or directories) to include
                          in filtered history. Multiple --path
                          options can be specified to get a union of
                          paths.''')
  path_group.add_argument('--path-glob', metavar='GLOB',
                          action=AppendFilter, dest='path_changes',
                          help='''Glob of paths to include in filtered
                          history. Multiple --path-glob options can
                          be specified to get a union of paths.''')
  path_group.add_argument('--path-regex', metavar='REGEX',
                          action=AppendFilter, dest='path_changes',
                          help='''Regex of paths to include in filtered
                          history. Multiple --path-regex options can
                          be specified to get a union of paths''')

  rename_group = parser.add_argument_group(title='Renaming based on paths')
  rename_group.add_argument('--path-rename', '--path-rename-prefix',
                            metavar='OLD_NAME:NEW_NAME',
                            action=AppendFilter, dest='path_changes',
                            help='''Prefix to rename; if filename starts with
                            OLD_NAME, replace that with NEW_NAME.
                            Multiple --path-rename options can be
                            specified.''')

  parser.add_argument('--dry-run', action='store_true',
                      help='''Do not change the repository. Run `git
                      fast-export` and filter its output, and save
                      both the original and the filtered version for
                      comparison. Some filtering of empty commits
                      may not occur due to inability to query the
                      fast-import backend.''')
  parser.add_argument('--debug', action='store_true',
                      help='''Print additional information about operations
                      being performed and commands being run. When
                      used together with --dry-run, also show extra
                      information about what would be run.''')
  parser.add_argument('--stdin', action='store_true',
                      help='''Instead of running `git fast-export` and filtering
                      its output, filter the fast-export stream from
                      stdin.''')

  parser.add_argument('--quiet', action='store_true',
                      help='''Pass --quiet to other git commands called''')

  parser.add_argument('revisions', nargs='*',
                      help='''Branches/tags/refs to rewrite. Special rev-list
                      options, such as --branches, --tags, --all,
                      --glob, or --exclude are allowed. [default:
                      --all]''')
  if len(sys.argv) == 1:
    parser.print_usage()
    raise SystemExit("No arguments specified.")
  args = parser.parse_args()
  if not args.revisions:
    args.revisions = ['--all']
  if args.analyze and args.path_changes:
    raise SystemExit("Error: --analyze is incompatible with --path* flags; "
                     "it's a read-only operation.")
  if args.analyze and args.stdin:
    raise SystemExit("Error: --analyze is incompatible with --stdin.")
  # If no path_changes are found, initialize with empty list but mark as
  # not inclusive so that all files match
  if args.path_changes is None:  # bugfix: identity test, not '== None'
    args.path_changes = []
    args.inclusive = False
  # Similarly, if we only have renames, all paths should match
  else:
    has_filter = any(mod_type == 'filter'
                     for (mod_type, match_type, path_expression)
                     in args.path_changes)
    if not has_filter:
      args.inclusive = False
  return args
|
def is_repository_bare():
  """
  Return True if the current repository is a bare repository.
  """
  cmd = 'git rev-parse --is-bare-repository'.split()
  answer = subprocess.check_output(cmd).strip()
  return answer == 'true'
|
def determine_git_dir():
  """
  Return the path to the current repository's git directory, as
  reported by `git rev-parse --git-dir`.
  """
  cmd = 'git rev-parse --git-dir'.split()
  return subprocess.check_output(cmd).strip()
|
def sanity_check(refs, is_bare):
  """
  Refuse to rewrite history unless the repository looks like a fresh
  clone (fully packed, one 'origin' remote, single-entry reflogs, no
  stash, and -- for non-bare repos -- no uncommitted/unstaged/untracked
  or unpushed changes).

  refs is a dict of refname -> hash (see get_refs()); is_bare states
  whether the repository is bare.  Raises SystemExit when a check fails.
  """
  def abort(reason):
    # Single place to construct the uniform failure message
    raise SystemExit(
      "Aborting: Refusing to overwrite repo history since this does not\n"
      "look like a fresh clone.\n"
      " ("+reason+")\n"
      "To override, use --force.")

  # Make sure repo is fully packed, just like a fresh clone would be
  output = subprocess.check_output('git count-objects -v'.split())
  stats = dict(x.split(': ') for x in output.splitlines())
  if stats['count'] != '0' or stats['packs'] != '1':
    abort("expected freshly packed repo")

  # Make sure there is precisely one remote, named "origin"
  output = subprocess.check_output('git remote'.split()).strip()
  if output != "origin":
    abort("expected one remote, origin")

  # Avoid letting people running with weird setups and overwriting GIT_DIR
  # elsewhere
  git_dir = determine_git_dir()
  if is_bare and git_dir != '.':
    abort("GIT_DIR must be .")
  elif not is_bare and git_dir != '.git':
    abort("GIT_DIR must be .git")

  # Make sure that all reflogs have precisely one entry
  reflog_dir=os.path.join(git_dir, 'logs')
  for root, dirs, files in os.walk(reflog_dir):
    for filename in files:
      pathname = os.path.join(root, filename)
      with open(pathname) as f:
        if len(f.read().splitlines()) > 1:
          shortpath = pathname[len(reflog_dir)+1:]
          abort("expected at most one entry in the reflog for " + shortpath)

  # Make sure there are no stashed changes
  if 'refs/stash' in refs:
    abort("has stashed changes")

  # Do extra checks in non-bare repos
  if not is_bare:
    # Avoid uncommitted, unstaged, or untracked changes.
    # Bugfix: plain `git diff --staged` always exits 0 (it just prints
    # the diff); --quiet is required for the exit status to reflect
    # whether staged changes exist.
    if subprocess.call('git diff --staged --quiet'.split()):
      abort("you have uncommitted changes")
    if subprocess.call('git diff --quiet'.split()):
      abort("you have unstaged changes")
    if len(subprocess.check_output('git ls-files -o'.split())) > 0:
      abort("you have untracked changes")

    # Avoid unpushed changes
    for refname, rev in refs.iteritems():
      if not refname.startswith('refs/heads/'):
        continue
      origin_ref = refname.replace('refs/heads/', 'refs/remotes/origin/')
      if origin_ref not in refs:
        abort('{} exists, but {} not found'.format(refname, origin_ref))
      if rev != refs[origin_ref]:
        abort('{} does not match {}'.format(refname, origin_ref))
|
def get_refs():
  """
  Return a dict mapping refnames to hashes for all refs in the current
  repository, as reported by `git show-ref`.
  """
  output = ''
  try:
    output = subprocess.check_output('git show-ref'.split())
  except subprocess.CalledProcessError as e:
    # Exit status 1 simply means there are no refs (e.g. brand new repo);
    # any other status is a real error (e.g. not a git repo at all).
    if e.returncode != 1:
      raise SystemExit('fatal: {}'.format(e))
  # Each show-ref line is '<hash> <refname>'; invert to refname -> hash
  pairs = (line.split() for line in output.splitlines())
  return dict((refname, hash_) for (hash_, refname) in pairs)
|
def analyze_commit(args, commit):
  """
  Gather statistics from a single commit for the --analyze report:
  which filenames each blob appears under, which files have been
  deleted (and when), and which renamed paths refer to versions of the
  same underlying file.  Updates args.stats in place and marks the
  commit as dumped so nothing is written out.
  """
  stats = args.stats

  def equiv_class(filename):
    return stats['equivalence'].get(filename, (filename,))

  for change in commit.file_changes:
    if change.mode == '160000':
      # Submodule (gitlink); not interesting for size/name analysis
      continue
    if change.type == 'D':
      # Track when files are deleted; see 'R' below about equiv_class
      for name in equiv_class(change.filename):
        stats['deletions'][name] = commit.committer_date
    elif change.type == 'R':
      # Since we want to know when files are deleted, renames make it
      # slightly harder to track.  When we have a rename, record that the
      # names are equivalent, i.e. different versions of the same file.
      old, new = change.filename
      known = stats['equivalence'].get(old, ())
      if new in known:
        continue
      chain = known + (new,) if known else (old, new)
      for name in chain:
        stats['equivalence'][name] = chain
      # Note, we require that we get an 'M' for every 'R' since the rename
      # comes without information about sha1sum.  So we can handle setting
      # a few things for the new name in the 'M' section below.
    elif change.type == 'M':
      stats['names'][change.blob_id].add(change.filename)
      stats['allnames'].add(change.filename)
      # If we get an 'M', clearly the file isn't deleted anymore
      equiv = equiv_class(change.filename)
      for name in equiv:
        stats['deletions'].pop(name, None)
      # If we get an 'M' for a file that wasn't the latest in a rename
      # chain, then that equivalence class isn't valid anymore.
      if equiv[-1] != change.filename:
        for name in equiv:
          if name in stats['equivalence']:
            del stats['equivalence'][name]
    else:
      raise SystemExit("Unhandled change type: {}".format(change.type))

  # We're just gathering data; don't spend time dumping the commit
  commit.dumped = 2
|
def gather_data(args):
  """
  Collect the statistics needed for the --analyze report: blob sizes
  (via `git cat-file --batch-check --batch-all-objects`) plus the
  per-commit name/deletion/rename data gathered by running each commit
  through analyze_commit().  Results are stored on args (args.size,
  args.stats, args.num_commits).
  """
  # Get sizes of blobs by sha1
  cat_file_cmd = 'git cat-file --batch-check --batch-all-objects'.split()
  cf = subprocess.Popen(cat_file_cmd, stdout = subprocess.PIPE)
  size = {}
  for line in cf.stdout:
    sha, objtype, objsize = line.split()
    if objtype == 'blob':
      size[sha] = int(objsize)
  stats = {'names': collections.defaultdict(set),
           'allnames' : set(),
           'deletions': {},
           'equivalence': {},
           'size': size}

  # Set up a fast-export process covering the requested revisions
  fep_cmd = ['git', 'fast-export',
             '-M',
             '--no-data',
             '--show-original-ids',
             '--always-show-modify-after-rename',
             '--signed-tags=strip',
             '--tag-of-filtered-object=rewrite',
             '--use-done-feature'] + args.revisions
  fep = subprocess.Popen(fep_cmd, stdout=subprocess.PIPE)
  export_stream = fep.stdout
  sink = open(os.devnull, 'w')

  # Create and run the filter; analyze_commit does all the bookkeeping
  args.size = size
  args.stats = stats
  analyze_filter = FastExportFilter(
      commit_callback = lambda c : analyze_commit(args, c),
  )
  analyze_filter.run(export_stream, sink, quiet = args.quiet)
  args.num_commits = analyze_filter.num_commits_parsed()

  # Close the output and make sure both subprocesses completed cleanly
  sink.close()
  if fep.wait():
    raise SystemExit("Error: fast-export failed; see above.")
  cf.wait()
|
def do_analysis(args, git_dir):
  """
  Run the --analyze mode: gather statistics about the repository's history
  (via gather_data) and write a human-readable report into
  $GIT_DIR/filter-repo/repo-analysis-<date>.txt.

  Raises SystemExit if the report file already exists and --force was not
  given, so we never silently clobber a previous report.
  """
  # Create the report file as necessary
  results_tmp_dir = os.path.join(git_dir, 'filter-repo')
  if not os.path.isdir(results_tmp_dir):
    os.mkdir(results_tmp_dir)
  reportfile = os.path.join(results_tmp_dir,
                            "repo-analysis-{}.txt".format(time.strftime("%F")))
  if not args.force and os.path.isfile(reportfile):
    raise SystemExit("Error: {} already exists; refusing to overwrite!".
                     format(reportfile))

  # Now gather the data we need
  gather_data(args)

  def datestr(datetimeobj):
    # A deletion date of None means the path is still present.
    return datetimeobj.strftime('%F') if datetimeobj else '<present>'

  def dirnames(path):
    # Yield each successively shorter leading directory of path, ending
    # with '' (which represents the toplevel directory).
    while True:
      path = os.path.dirname(path)
      yield path
      if path == '':
        break

  # Compute aggregate unpacked size information for paths, extensions, and dirs
  total_size = 0
  path_size = collections.defaultdict(int)
  ext_size = collections.defaultdict(int)
  dir_size = collections.defaultdict(int)
  for sha in args.stats['names']:
    size = args.size[sha]
    # A blob counts once per distinct name it was ever stored under.
    for name in args.stats['names'][sha]:
      total_size += size
      path_size[name] += size
      basename, ext = os.path.splitext(name)
      ext_size[ext] += size
      for dirname in dirnames(name):
        dir_size[dirname] += size

  # Determine if and when extensions and directories were deleted.  An
  # extension/directory is considered deleted only if ALL paths using it
  # were deleted; the deletion date used is the latest of those deletions.
  ext_deleted_data = {}
  dir_deleted_data = {}
  for name in args.stats['allnames']:
    when = args.stats['deletions'].get(name, None)

    # Update the extension
    basename, ext = os.path.splitext(name)
    if when is None:
      # A still-present path pins its extension as still present.
      ext_deleted_data[ext] = None
    elif ext in ext_deleted_data:
      if ext_deleted_data[ext] is not None:
        ext_deleted_data[ext] = max(ext_deleted_data[ext], when)
    else:
      ext_deleted_data[ext] = when

    # Update the dirs
    for dirname in dirnames(name):
      if when is None:
        # A still-present path pins all its parent dirs as still present.
        dir_deleted_data[dirname] = None
      elif dirname in dir_deleted_data:
        if dir_deleted_data[dirname] is not None:
          dir_deleted_data[dirname] = max(dir_deleted_data[dirname], when)
      else:
        dir_deleted_data[dirname] = when

  with open(reportfile, 'w') as f:
    # Give a basic overview of this file
    f.write("== Table of Contents ==\n")
    f.write(" * Overall Statistics\n")
    f.write(" * Caveats\n")
    f.write(" * File renames\n")
    f.write(" * Directory sizes\n")
    f.write(" * Deleted directories\n")
    f.write(" * All directories\n")
    f.write(" * Filename extension sizes\n")
    f.write(" * Deleted extensions\n")
    f.write(" * All extensions\n")
    f.write(" * Path sizes (accumulated across commits)\n")
    f.write(" * Deleted paths\n")
    f.write(" * All paths\n")
    f.write(" * Files by sha and associated pathnames\n")
    f.write("\n")

    # Provide total unpacked size
    f.write("== Overall Statistics ==\n")
    f.write(" Number of commits: {}\n".format(args.num_commits))
    f.write(" Number of filenames: {}\n".format(len(path_size)))
    f.write(" Number of directories: {}\n".format(len(dir_size)))
    f.write(" Number of file extensions: {}\n".format(len(ext_size)))
    f.write("\n")
    f.write(" Total unpacked size: {}\n".format(total_size))
    f.write("\n")
    f.write(" (Unpacked size represents what size your repository would be\n")
    f.write(" if no trees, commits, tags, or other metadata were included\n")
    f.write(" AND if no files were packed; i.e., without delta-ing and\n")
    f.write(" without compression.)\n")
    f.write("\n")

    # Mention issues with the report
    f.write("== Caveats ==\n")
    f.write("=== Deletions ===\n")
    f.write(textwrap.dedent("""
      Whether a file is deleted is not a binary quality, since it can be
      deleted on some branches but still exist in others.  Also, it might
      exist in an old tag, but have been deleted in versions newer than
      that.  More thorough tracking could be done, including looking at
      merge commits where one side of history deleted and the other modified,
      in order to give a more holistic picture of deletions.  However, that
      algorithm would not only be more complex to implement, it'd also be
      quite difficult to present and interpret by users.  Since --analyze
      is just about getting a high-level rough picture of history, it instead
      implements the simplistic rule that is good enough for 98% of cases:
        A file is marked as deleted if the last commit in the fast-export
        stream that mentions the file lists it as deleted.
      This makes it dependent on topological ordering, but generally gives
      the "right" answer.
      """[1:]))
    f.write("=== Renames ===\n")
    f.write(textwrap.dedent("""
      Renames share the same non-binary nature that deletions do, plus
      additional challenges:
        * If the renamed file is renamed again, instead of just two names for
          a path you can have three or more.
        * Rename pairs of the form (oldname, newname) that we consider to be
          different names of the "same file" might only be valid over certain
          commit ranges.  For example, if a new commit reintroduces a file
          named oldname, then new versions of oldname aren't the "same file"
          anymore.  We could try to portray this to the user, but it's easier
          for the user to just break the pairing and only report unbroken
          rename pairings to the user.
        * Since modifying a renamed file on the side of history that doesn't
          rename it should be expected to be common (unlike modifying a deleted
          file on the side of history that doesn't delete it), tracking history
          becomes more important to avoid incorrectly breaking rename chains.
          This has not yet been implemented.  This seriously raises the risk
          of erroneously breaking rename pairings; a future release may address
          this shortcoming.
        * We only use rename detection, not copy detection.  However, that
          means that if some commit in history renamed two files into the same
          location, we won't pick up one of the two renames and will instead
          report that branch as having been deleted.
        * The ability for users to rename files differently in different
          branches means that our chains of renames will not necessarily be
          linear but may branch out.
      """[1:]))
    f.write("\n")

    # Equivalence classes for names, so if folks only want to keep a
    # certain set of paths, they know the old names they want to include
    # too.
    f.write("== File renames ==\n")
    seen = set()
    for pathname,equiv_group in sorted(args.stats['equivalence'].iteritems(),
                                       key=lambda x:x[1]):
      if equiv_group in seen:
        continue
      seen.add(equiv_group)
      f.write(" {} ->\n ".format(equiv_group[0]) +
              "\n ".join(equiv_group[1:]) +
              "\n")
    f.write("\n")

    # List directories in reverse sorted order of unpacked size
    f.write("== Directory sizes ==\n")
    f.write("=== Deleted directories by reverse size ===\n")
    f.write("Format: size (bytes), date deleted, directory name\n")
    for dirname, size in sorted(dir_size.iteritems(),
                                key=lambda x:x[1], reverse=True):
      if (dir_deleted_data[dirname]):
        f.write(" {:10d} {:10s} {}\n".format(size,
                                             datestr(dir_deleted_data[dirname]),
                                             dirname or '<toplevel>'))
    f.write("\n")
    f.write("=== All directories by reverse size ===\n")
    f.write("Format: size (bytes), date deleted, directory name\n")
    for dirname, size in sorted(dir_size.iteritems(),
                                key=lambda x:x[1], reverse=True):
      f.write(" {:10d} {:10s} {}\n".format(size, datestr(dir_deleted_data[dirname]),
                                           dirname or '<toplevel>'))
    f.write("\n")

    # List extensions in reverse sorted order of unpacked size
    f.write("== Filename extension sizes ==\n")
    f.write("=== Deleted extensions by reverse size ===\n")
    f.write("Format: size (bytes), date deleted, extension name\n")
    for extname, size in sorted(ext_size.iteritems(),
                                key=lambda x:x[1], reverse=True):
      if (ext_deleted_data[extname]):
        f.write(" {:10d} {:10s} {}\n".format(size,
                                             datestr(ext_deleted_data[extname]),
                                             extname or '<no extension>'))
    f.write("\n")
    f.write("=== All extensions by reverse size ===\n")
    f.write("Format: size (bytes), date deleted, extension name\n")
    for extname, size in sorted(ext_size.iteritems(),
                                key=lambda x:x[1], reverse=True):
      f.write(" {:10d} {:10s} {}\n".format(size,
                                           datestr(ext_deleted_data[extname]),
                                           extname or '<no extension>'))
    f.write("\n")

    # List files in reverse sorted order of unpacked size
    f.write("== Path sizes (accumulated across commits) ==\n")
    f.write("=== Deleted paths by reverse size ===\n")
    f.write("Format: size (bytes), date deleted, path name(s)\n")
    for pathname, size in sorted(path_size.iteritems(),
                                 key=lambda x:x[1], reverse=True):
      when = args.stats['deletions'].get(pathname, None)
      if when:
        f.write(" {:10d} {:10s} {}\n".format(size, datestr(when), pathname))
    f.write("\n")
    f.write("=== All paths by reverse size ===\n")
    f.write("Format: size (bytes), date deleted, path name\n")
    for pathname, size in sorted(path_size.iteritems(),
                                 key=lambda x:x[1], reverse=True):
      when = args.stats['deletions'].get(pathname, None)
      f.write(" {:10d} {:10s} {}\n".format(size, datestr(when), pathname))
    f.write("\n")

    # List of filenames and sizes in descending order
    f.write("== Files by sha and associated pathnames in reverse size ==\n")
    f.write("Format: sha, size (bytes), filename(s) object stored as\n")
    for sha, size in sorted(args.size.iteritems(), key=lambda x:x[1],
                            reverse=True):
      if sha not in args.stats['names']:
        # Some objects in the repository might not be referenced, or not
        # referenced by the branches/tags the user cares about; skip them.
        continue
      names_with_sha = args.stats['names'][sha]
      if len(names_with_sha) == 1:
        names_with_sha = names_with_sha.pop()
      else:
        names_with_sha = sorted(list(names_with_sha))
      f.write(" {} {:9d} {}\n".format(sha, size, names_with_sha))
    f.write("\n")

  print("Report written to {}".format(reportfile))
|
|
|
|
def tweak_commit(args, commit):
  """
  Apply the user's path filtering and renaming rules (args.path_changes)
  to the file changes of a single commit, modifying commit.file_changes
  in place.
  """
  def filename_matches(path_expression, pathname):
    # An empty expression matches every path; otherwise the expression
    # must equal the path, be a leading directory of it, or (if it ends
    # with '/') be a directory prefix of it.
    if path_expression == '':
      return True
    if not pathname.startswith(path_expression):
      return False
    n = len(path_expression)
    return (path_expression[n-1] == '/' or
            len(pathname) == n or
            pathname[n] == '/')

  def newname(path_changes, pathname, filtering_is_inclusive):
    wanted = False
    for (mod_type, match_type, path_expression) in path_changes:
      if mod_type == 'filter':
        # Once a path is wanted, further filter rules can't change that.
        if not wanted:
          if match_type == 'match':
            wanted = filename_matches(path_expression, pathname)
          elif match_type == 'glob':
            wanted = fnmatch.fnmatch(pathname, path_expression)
          elif match_type == 'regex':
            wanted = bool(re.search(path_expression, pathname))
      elif mod_type == 'rename':
        old_exp, new_exp = path_expression.split(':')
        if match_type == 'prefix' and pathname.startswith(old_exp):
          pathname = pathname.replace(old_exp, new_exp, 1)
    if wanted == filtering_is_inclusive:
      return pathname
    return None

  surviving_changes = {}
  for change in commit.file_changes:
    change.filename = newname(args.path_changes, change.filename, args.inclusive)
    if not change.filename:
      continue # Filtering criteria excluded this file; move on to next one
    earlier = surviving_changes.get(change.filename)
    if earlier is not None:
      # Getting here means that path renaming is in effect, and caused one
      # path to collide with another.  That's usually bad, but sometimes
      # people have a file named OLDFILE in old revisions of history, and they
      # rename to NEWFILE, and would like to rewrite history so that all
      # revisions refer to it as NEWFILE.  As such, we can allow a collision
      # when (at least) one of the two paths is a deletion.  Note that if
      # OLDFILE and NEWFILE are unrelated this also allows the rewrite to
      # continue, which makes sense since OLDFILE is no longer in the way.
      if change.type == 'D':
        # We can just throw this one away and keep the other
        continue
      if earlier.type != 'D':
        raise SystemExit("File renaming caused colliding pathnames!\n" +
                         " Commit: {}\n".format(commit.original_id) +
                         " Filename: {}".format(change.filename))
    surviving_changes[change.filename] = change
  commit.file_changes = surviving_changes.values()
|
|
|
|
class InputFileBackup:
  """
  File-like wrapper around an input stream that copies everything read
  from it into a second (backup) stream, so the raw fast-export output
  can be saved for later inspection.
  """
  def __init__(self, input_file, output_file):
    self.input_file = input_file
    self.output_file = output_file

  def read(self, size):
    # Mirror the bytes to the backup file before handing them back.
    data = self.input_file.read(size)
    self.output_file.write(data)
    return data

  def readline(self):
    # Same mirroring, one line at a time.
    data = self.input_file.readline()
    self.output_file.write(data)
    return data
|
|
|
|
class DualFileWriter:
  """
  File-like writer that duplicates every write to two underlying files;
  used in debug mode to tee the filtered stream to fast-import and to a
  debugging file simultaneously.
  """
  def __init__(self, file1, file2):
    self.file1 = file1
    self.file2 = file2

  def write(self, *args):
    # Forward the write unchanged to both sinks, file1 first.
    for sink in (self.file1, self.file2):
      sink.write(*args)

  def close(self):
    # Close both sinks, file1 first.
    self.file1.close()
    self.file2.close()
|
|
|
|
def run_fast_filter():
  """
  Main driver: parse command line arguments, then either run --analyze
  mode, or run the full
    git fast-export | filter | git fast-import
  pipeline and clean up the rewritten repository afterward.
  """
  args = get_args()
  if args.debug:
    print("[DEBUG] Parsed arguments:\n{}".format(args))

  # Determine basic repository information
  orig_refs = get_refs()
  is_bare = is_repository_bare()
  git_dir = determine_git_dir()

  # Do analysis, if requested
  if args.analyze:
    do_analysis(args, git_dir)
    return

  # Do sanity checks
  if not args.force:
    sanity_check(orig_refs, is_bare)

  # Create a temporary directory for storing some results
  results_tmp_dir = os.path.join(git_dir, 'filter-repo')
  if not os.path.isdir(results_tmp_dir):
    os.mkdir(results_tmp_dir)

  # Determine where to get input (and whether to make a copy)
  if args.stdin:
    # User supplies the fast-export stream themselves; no copy is saved.
    input = sys.stdin
    fe_orig = None
  else:
    fep_cmd = ['git', 'fast-export',
               '--show-original-ids',
               '--signed-tags=strip',
               '--tag-of-filtered-object=rewrite',
               '--no-data',
               '--use-done-feature'] + args.revisions
    fep = subprocess.Popen(fep_cmd, stdout=subprocess.PIPE)
    input = fep.stdout
    if args.dry_run or args.debug:
      # Tee the raw fast-export stream to a file for later comparison.
      fe_orig = os.path.join(results_tmp_dir, 'fast-export.original')
      output = open(fe_orig, 'w')
      input = InputFileBackup(input, output)
      if args.debug:
        print("[DEBUG] Running: {}".format(' '.join(fep_cmd)))
        print(" (saving a copy of the output at {})".format(fe_orig))

  # Determine where to send output
  pipes = None
  if not args.dry_run:
    fip_cmd = 'git fast-import --force --quiet'.split()
    fip = subprocess.Popen(fip_cmd, stdin=subprocess.PIPE,
                           stdout=subprocess.PIPE)
    pipes = (fip.stdin, fip.stdout)
  if args.dry_run or args.debug:
    # Save the filtered stream to a file (instead of, or in addition to,
    # feeding it to fast-import).
    fe_filt = os.path.join(results_tmp_dir, 'fast-export.filtered')
    output = open(fe_filt, 'w')
  else:
    output = fip.stdin
  if args.debug:
    # Debug mode (without --dry-run): write to fast-import AND the file.
    output = DualFileWriter(fip.stdin, output)
    print("[DEBUG] Running: {}".format(' '.join(fip_cmd)))
    print(" (using the following file as input: {})".format(fe_filt))

  # Create and run the filter
  filter = FastExportFilter(
    commit_callback = lambda c : tweak_commit(args, c),
    )
  filter.run(input, output, fast_import_pipes = pipes, quiet = args.quiet)

  # Close the output, ensure fast-export and fast-import have completed
  output.close()
  if not args.stdin and fep.wait():
    raise SystemExit("Error: fast-export failed; see above.")
  if not args.dry_run and fip.wait():
    raise SystemExit("Error: fast-import failed; see above.")

  # Exit early
  if args.dry_run:
    orig_str = "by comparing:\n "+fe_orig if fe_orig else "at:"
    print("NOTE: Not running fast-import or cleaning up; --dry-run passed.")
    print(" Requested filtering can be seen {}".format(orig_str))
    print(" " + fe_filt)
    sys.exit(0)

  # Remove unused refs (refs present before the rewrite but never seen
  # in the filtered stream, i.e. refs whose history was entirely removed)
  refs_to_nuke = set(orig_refs) - set(filter.get_seen_refs())
  if refs_to_nuke:
    if args.debug:
      print("[DEBUG] Deleting the following refs:\n "+
            "\n ".join(refs_to_nuke))
    p = subprocess.Popen('git update-ref --stdin'.split(),
                         stdin=subprocess.PIPE)
    p.stdin.write(''.join(["option no-deref\ndelete {}\n".format(x)
                           for x in refs_to_nuke]))
    p.stdin.close()
    if p.wait():
      raise SystemExit("git update-ref failed; see above")

  # Write out data about run
  filter.record_metadata(results_tmp_dir, orig_refs, refs_to_nuke)

  # Nuke the reflogs and repack
  if not args.quiet and not args.debug:
    print("Repacking your repo and cleaning out old unneeded objects")
  quiet_flags = '--quiet' if args.quiet else ''
  cleanup_cmds = ['git reflog expire --expire=now --all'.split(),
                  'git gc {} --prune=now'.format(quiet_flags).split()]
  if not is_bare:
    # Non-bare repo: also update the working tree to the rewritten HEAD.
    cleanup_cmds.append('git reset {} --hard'.format(quiet_flags).split())
  for cmd in cleanup_cmds:
    if args.debug:
      print("[DEBUG] Running: {}".format(' '.join(cmd)))
    subprocess.call(cmd)
|
|
|
|
if __name__ == '__main__':
  # Invoked as a script (rather than imported as a library): run the filter.
  run_fast_filter()
|