Mirror of https://github.com/newren/git-filter-repo.git, synced 2024-07-04 01:15:41 +02:00.
filter-repo: add a --state-branch option for incremental exporting
Allow folks to periodically update the export of a live repo without re-exporting from the beginning. This is a performance improvement, but can also be important for collaboration. For example, for sensitivity reasons, folks might want to export a subset of a repo and update the export periodically. While this could be done by just re-exporting the repository anew each time, there is a risk that the paths used to specify the wanted subset might need to change in the future; making the user verify that their paths (including globs or regexes) don't also pick up anything from history that was previously excluded so that they don't get a divergent history is not very user friendly. Allowing them to just export stuff that is new since the last export works much better for them. Signed-off-by: Elijah Newren <newren@gmail.com>
This commit is contained in:
parent
b5b0cf4230
commit
71bb8d26a9
|
@ -905,7 +905,7 @@ sequence that more accurately reflects what filter-repo runs is:
|
|||
1. Verify we're in a fresh clone
|
||||
2. `git fetch -u . refs/remotes/origin/*:refs/heads/*`
|
||||
3. `git remote rm origin`
|
||||
4. `git fast-export --show-original-ids --reference-excluded-parents --fake-missing-tagger --signed-tags=strip --tag-of-filtered-object=rewrite --use-done-feature --no-data --reencode=yes --all | filter | git fast-import --force --quiet`
|
||||
4. `git fast-export --show-original-ids --reference-excluded-parents --fake-missing-tagger --signed-tags=strip --tag-of-filtered-object=rewrite --use-done-feature --no-data --reencode=yes --mark-tags --all | filter | git fast-import --force --quiet`
|
||||
5. `git update-ref --no-deref --stdin`, fed with a list of refs to nuke, and a list of replace refs to delete, create, or update.
|
||||
6. `git reset --hard`
|
||||
7. `git reflog expire --expire=now --all`
|
||||
|
|
150
git-filter-repo
150
git-filter-repo
|
@ -51,6 +51,7 @@ __all__ = ["Blob", "Reset", "FileChange", "Commit", "Tag", "Progress",
|
|||
"record_id_rename", "GitUtils", "FilteringOptions", "RepoFilter"]
|
||||
|
||||
deleted_hash = b'0'*40
|
||||
write_marks = True
|
||||
|
||||
def gettext_poison(msg):
|
||||
if "GIT_TEST_GETTEXT_POISON" in os.environ: # pragma: no cover
|
||||
|
@ -712,7 +713,7 @@ class Commit(_GitElementWithId):
|
|||
_SKIPPED_COMMITS.add(self.old_id or self.id)
|
||||
_GitElementWithId.skip(self, new_id)
|
||||
|
||||
class Tag(_GitElement):
|
||||
class Tag(_GitElementWithId):
|
||||
"""
|
||||
This class defines our representation of annotated tag elements.
|
||||
"""
|
||||
|
@ -720,7 +721,8 @@ class Tag(_GitElement):
|
|||
def __init__(self, ref, from_ref,
|
||||
tagger_name, tagger_email, tagger_date, tag_msg,
|
||||
original_id = None):
|
||||
_GitElement.__init__(self)
|
||||
_GitElementWithId.__init__(self)
|
||||
self.old_id = self.id
|
||||
|
||||
# Denote that this is a tag element
|
||||
self.type = 'tag'
|
||||
|
@ -754,6 +756,8 @@ class Tag(_GitElement):
|
|||
self.dumped = 1
|
||||
|
||||
file_.write(b'tag %s\n' % self.ref)
|
||||
if (write_marks and self.id):
|
||||
file_.write(b'mark :%d\n' % self.id)
|
||||
markfmt = b'from :%d\n' if isinstance(self.from_ref, int) else 'from %s\n'
|
||||
file_.write(markfmt % self.from_ref)
|
||||
if self.tagger_name:
|
||||
|
@ -834,6 +838,29 @@ class LiteralCommand(_GitElement):
|
|||
|
||||
file_.write(self.line)
|
||||
|
||||
class Alias(_GitElement):
  """
  This class defines our representation of fast-import alias elements.  An
  alias element is the setting of one mark to the same sha1sum as another,
  usually because the newer mark corresponded to a pruned commit.
  """

  def __init__(self, ref, to_ref):
    _GitElement.__init__(self)
    # Denote that this is an alias (comment previously mislabelled this as
    # a "reset", copied from the Reset class)
    self.type = 'alias'

    # Both marks are integers; ref becomes an alias for to_ref
    self.ref = ref
    self.to_ref = to_ref

  def dump(self, file_):
    """
    Write this alias element to a file
    """
    self.dumped = 1

    file_.write(b'alias\nmark :%d\nto :%d\n\n' % (self.ref, self.to_ref))
|
||||
|
||||
class FastExportParser(object):
|
||||
"""
|
||||
A class for parsing and handling the output from fast-export. This
|
||||
|
@ -1233,6 +1260,7 @@ class FastExportParser(object):
|
|||
# Parse the Tag
|
||||
tag = self._parse_ref_line(b'tag')
|
||||
self._exported_refs.add(b'refs/tags/'+tag)
|
||||
id_ = self._parse_optional_mark()
|
||||
ignoreme, from_ref = self._parse_optional_parent_ref(b'from')
|
||||
|
||||
original_id = None
|
||||
|
@ -1251,6 +1279,12 @@ class FastExportParser(object):
|
|||
tagger_name, tagger_email, tagger_date, tag_msg,
|
||||
original_id)
|
||||
|
||||
# If fast-export text had a mark for this tag, need to make sure this
|
||||
# mark translates to the tag's true id.
|
||||
if id_:
|
||||
tag.old_id = id_
|
||||
_IDS.record_rename(id_, tag.id)
|
||||
|
||||
# Call any user callback to allow them to modify the tag
|
||||
if self._tag_callback:
|
||||
self._tag_callback(tag)
|
||||
|
@ -1808,6 +1842,23 @@ EXAMPLES
|
|||
"performed and commands being run. When used together "
|
||||
"with --dry-run, also show extra information about what "
|
||||
"would be run."))
|
||||
# WARNING: --state-branch has some problems:
|
||||
# * It does not work well with manually inserted objects (user creating
|
||||
# Blob() or Commit() or Tag() objects and calling
|
||||
# RepoFilter.insert(obj) on them).
|
||||
# * It does not work well with multiple source or multiple target repos
|
||||
# * It doesn't work so well with pruning become-empty commits (though
|
||||
# --refs doesn't work so well with it either)
|
||||
# These are probably fixable, given some work (e.g. re-importing the
|
||||
# graph at the beginning to get the AncestryGraph right, doing our own
|
||||
# export of marks instead of using fast-export --export-marks, etc.), but
|
||||
# for now just hide the option.
|
||||
misc.add_argument('--state-branch',
|
||||
#help=_("Enable incremental filtering by saving the mapping of old "
|
||||
# "to new objects to the specified branch upon exit, and"
|
||||
# "loading that mapping from that branch (if it exists) "
|
||||
# "upon startup."))
|
||||
help=argparse.SUPPRESS)
|
||||
misc.add_argument('--stdin', action='store_true',
|
||||
help=_("Instead of running `git fast-export` and filtering its "
|
||||
"output, filter the fast-export stream from stdin. The "
|
||||
|
@ -1846,6 +1897,12 @@ EXAMPLES
|
|||
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
|
||||
p.wait()
|
||||
output = p.stdout.read()
|
||||
if b'--mark-tags' not in output: # pragma: no cover
|
||||
global write_marks
|
||||
write_marks = False
|
||||
if args.state_branch:
|
||||
raise SystemExit(_("Error: need a version of git whose fast-export "
|
||||
"command has the --mark-tags option"))
|
||||
if b'--reencode' not in output: # pragma: no cover
|
||||
if args.preserve_commit_encoding:
|
||||
raise SystemExit(_("Error: need a version of git whose fast-export "
|
||||
|
@ -3136,7 +3193,10 @@ class RepoFilter(object):
|
|||
|
||||
# Record ancestry graph
|
||||
parents, orig_parents = commit.parents, aux_info['orig_parents']
|
||||
external_parents = [p for p in parents if not isinstance(p, int)]
|
||||
if self._args.state_branch:
|
||||
external_parents = parents
|
||||
else:
|
||||
external_parents = [p for p in parents if not isinstance(p, int)]
|
||||
self._graph.record_external_commits(external_parents)
|
||||
self._orig_graph.record_external_commits(external_parents)
|
||||
self._graph.add_commit_and_parents(commit.id, parents)
|
||||
|
@ -3159,6 +3219,9 @@ class RepoFilter(object):
|
|||
else:
|
||||
rewrite_to = new_1st_parent or commit.first_parent()
|
||||
commit.skip(new_id = rewrite_to)
|
||||
if self._args.state_branch:
|
||||
alias = Alias(commit.old_id or commit.id, rewrite_to or deleted_hash)
|
||||
self._insert_into_stream(alias)
|
||||
reset = Reset(commit.branch, rewrite_to or deleted_hash)
|
||||
self._insert_into_stream(reset)
|
||||
self._commit_renames[commit.original_id] = None
|
||||
|
@ -3223,6 +3286,64 @@ class RepoFilter(object):
|
|||
os.mkdir(d)
|
||||
return d
|
||||
|
||||
def _load_marks_file(self, marks_basename):
  """
  Restore a marks file (b'source-marks' or b'target-marks') previously
  saved on the state branch, writing its contents into results_tmp_dir().

  Returns the path to the (possibly empty) marks file.  As a side effect,
  advances _IDS._next_id past the largest mark found, so ids allocated in
  this run never collide with marks recorded by the prior run.
  """
  full_branch = 'refs/heads/{}'.format(self._args.state_branch)
  marks_file = os.path.join(self.results_tmp_dir(), marks_basename)
  working_dir = self._args.target or b'.'
  cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
  contents = b''
  # Only attempt to read old marks if the state branch actually exists
  if subprocess.call(cmd, stdout=subprocess.DEVNULL) == 0:
    cmd = ['git', '-C', working_dir, 'show',
           '%s:%s' % (full_branch, decode(marks_basename))]
    try:
      contents = subprocess.check_output(cmd)
    except subprocess.CalledProcessError as e: # pragma: no cover
      # BUGFIX: previously referenced undefined name `branch`, so the
      # error path raised a NameError instead of this message; report the
      # branch we actually tried to load from.
      raise SystemExit(_("Failed loading %s from %s") %
                       (decode(marks_basename), full_branch))
  if contents:
    # Each marks line is ":<num> <sha1>"; strip the ':' to parse the int
    biggest_id = max(int(x.split()[0][1:]) for x in contents.splitlines())
    _IDS._next_id = max(_IDS._next_id, biggest_id+1)
  with open(marks_file, 'bw') as f:
    f.write(contents)
  return marks_file
|
||||
|
||||
def _save_marks_files(self):
  """
  Commit the source-marks and target-marks files from results_tmp_dir()
  onto refs/heads/<state_branch> in the target repo, so a later run can
  resume incrementally via _load_marks_file().
  """
  marks_names = [b'source-marks', b'target-marks']
  repo = self._args.target or b'.'
  full_branch = 'refs/heads/{}'.format(self._args.state_branch)

  # If the state branch already exists, chain the new commit onto it
  parent = []
  if subprocess.call(['git', '-C', repo, 'show-ref', full_branch],
                     stdout=subprocess.DEVNULL) == 0:
    parent = ['-p', full_branch]

  # Write each marks file into the object store; remember the blob hashes
  blob_hashes = {}
  for name in marks_names:
    path = os.path.join(self.results_tmp_dir(), name)
    if not os.path.isfile(path): # pragma: no cover
      raise SystemExit(_("Failed to find %s to save to %s")
                       % (path, self._args.state_branch))
    hash_cmd = ['git', '-C', repo, 'hash-object', '-w', path]
    blob_hashes[name] = subprocess.check_output(hash_cmd).strip()

  # Build a tree containing both blobs via 'git mktree'
  mktree = subprocess.Popen(['git', '-C', repo, 'mktree'],
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
  for name in marks_names:
    mktree.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[name], name))
  mktree.stdin.close()
  mktree.wait()
  tree = mktree.stdout.read().strip()

  # Commit the tree and point the state branch at the new commit
  commit_cmd = (['git', '-C', repo, 'commit-tree', '-m', 'New mark files',
                 tree] + parent)
  new_commit = subprocess.check_output(commit_cmd).strip()
  subprocess.call(['git', '-C', repo, 'update-ref',
                   full_branch, new_commit])
|
||||
|
||||
def importer_only(self):
|
||||
self._run_sanity_checks()
|
||||
self._setup_output()
|
||||
|
@ -3258,6 +3379,13 @@ class RepoFilter(object):
|
|||
self._unpacked_size, packed_size = GitUtils.get_blob_sizes()
|
||||
if use_done_feature:
|
||||
extra_flags.append('--use-done-feature')
|
||||
if write_marks:
|
||||
extra_flags.append(b'--mark-tags')
|
||||
if self._args.state_branch:
|
||||
assert(write_marks)
|
||||
source_marks_file = self._load_marks_file(b'source-marks')
|
||||
extra_flags.extend([b'--export-marks='+source_marks_file,
|
||||
b'--import-marks='+source_marks_file])
|
||||
if self._args.preserve_commit_encoding is not None: # pragma: no cover
|
||||
reencode = 'no' if self._args.preserve_commit_encoding else 'yes'
|
||||
extra_flags.append('--reencode='+reencode)
|
||||
|
@ -3274,8 +3402,7 @@ class RepoFilter(object):
|
|||
output = open(self._fe_orig, 'bw')
|
||||
self._input = InputFileBackup(self._input, output)
|
||||
if self._args.debug:
|
||||
tmp = fep_cmd.copy()
|
||||
tmp[2] = decode(tmp[2]) if isinstance(tmp[2], bytes) else tmp[2]
|
||||
tmp = [decode(x) if isinstance(x, bytes) else x for x in fep_cmd]
|
||||
print("[DEBUG] Running: {}".format(' '.join(tmp)))
|
||||
print(" (saving a copy of the output at {})"
|
||||
.format(decode(self._fe_orig)))
|
||||
|
@ -3283,7 +3410,11 @@ class RepoFilter(object):
|
|||
def _setup_output(self):
|
||||
if not self._args.dry_run:
|
||||
location = ['-C', self._args.target] if self._args.target else []
|
||||
fip_cmd = ['git'] + location + 'fast-import --force --quiet'.split()
|
||||
fip_cmd = ['git'] + location + ['fast-import', '--force', '--quiet']
|
||||
if self._args.state_branch:
|
||||
target_marks_file = self._load_marks_file(b'target-marks')
|
||||
fip_cmd.extend([b'--export-marks='+target_marks_file,
|
||||
b'--import-marks='+target_marks_file])
|
||||
self._fip = subprocess.Popen(fip_cmd,
|
||||
bufsize=-1,
|
||||
stdin=subprocess.PIPE,
|
||||
|
@ -3297,8 +3428,7 @@ class RepoFilter(object):
|
|||
self._output = self._fip.stdin
|
||||
if self._args.debug:
|
||||
self._output = DualFileWriter(self._fip.stdin, self._output)
|
||||
tmp = fip_cmd.copy()
|
||||
tmp[2] = decode(tmp[2]) if isinstance(tmp[2], bytes) else tmp[2]
|
||||
tmp = [decode(x) if isinstance(x, bytes) else x for x in fip_cmd]
|
||||
print("[DEBUG] Running: {}".format(' '.join(tmp)))
|
||||
print(" (using the following file as input: {})"
|
||||
.format(decode(self._fe_filt)))
|
||||
|
@ -3549,6 +3679,10 @@ class RepoFilter(object):
|
|||
if not self._args.dry_run and self._fip.wait():
|
||||
raise SystemExit(_("Error: fast-import failed; see above.")) # pragma: no cover
|
||||
|
||||
# With fast-export and fast-import complete, update state if requested
|
||||
if self._args.state_branch:
|
||||
self._save_marks_files()
|
||||
|
||||
# Notify user how long it took, before doing a gc and such
|
||||
repack = (not self._args.source and not self._args.target)
|
||||
msg = "New history written in {:.2f} seconds..."
|
||||
|
|
|
@ -7,6 +7,7 @@ test_description='Basic filter-repo tests'
|
|||
export PATH=$(dirname $TEST_DIRECTORY):$PATH # Put git-filter-repo in PATH
|
||||
|
||||
DATA="$TEST_DIRECTORY/t9390"
|
||||
SQ="'"
|
||||
|
||||
filter_testcase() {
|
||||
INPUT=$1
|
||||
|
@ -1210,4 +1211,102 @@ test_expect_success 'handle funny characters' '
|
|||
)
|
||||
'
|
||||
|
||||
# BUGFIX: the first test_create_repo was missing '&&', breaking the
# &&-chain (a failure there would go undetected per Git's t/README rules).
test_expect_success '--state-branch with changing renames' '
	test_create_repo state_branch_renames_export &&
	test_create_repo state_branch_renames &&
	(
		cd state_branch_renames &&
		git fast-import --quiet <$DATA/basic-numbers &&
		git branch -d A &&
		git branch -d B &&
		git tag -d v1.0 &&

		ORIG=$(git rev-parse master) &&
		git reset --hard master~1 &&
		git filter-repo --path-rename ten:zehn \
		                --state-branch state_info \
		                --target ../state_branch_renames_export &&

		cd ../state_branch_renames_export &&
		git log --format=%s --name-status >actual &&
		cat <<-EOF >expect &&
		Merge branch ${SQ}A${SQ} into B
		add twenty

		M	twenty
		add ten

		M	zehn
		Initial

		A	twenty
		A	zehn
		EOF
		test_cmp expect actual &&

		cd ../state_branch_renames &&

		git reset --hard $ORIG &&
		git filter-repo --path-rename twenty:veinte \
		                --state-branch state_info \
		                --target ../state_branch_renames_export &&

		cd ../state_branch_renames_export &&
		git log --format=%s --name-status >actual &&
		cat <<-EOF >expect &&
		whatever

		A	ten
		A	veinte
		Merge branch ${SQ}A${SQ} into B
		add twenty

		M	twenty
		add ten

		M	zehn
		Initial

		A	twenty
		A	zehn
		EOF
		test_cmp expect actual
	)
'
|
||||
|
||||
# BUGFIX: the first test_create_repo was missing '&&', breaking the
# &&-chain (a failure there would go undetected per Git's t/README rules).
test_expect_success '--state-branch with expanding paths and refs' '
	test_create_repo state_branch_more_paths_export &&
	test_create_repo state_branch_more_paths &&
	(
		cd state_branch_more_paths &&
		git fast-import --quiet <$DATA/basic-numbers &&

		git reset --hard master~1 &&
		git filter-repo --path ten --state-branch state_info \
		                --target ../state_branch_more_paths_export \
		                --refs master &&

		cd ../state_branch_more_paths_export &&
		echo 2 >expect &&
		git rev-list --count master >actual &&
		test_cmp expect actual &&
		test_must_fail git rev-parse master~1:twenty &&
		test_must_fail git rev-parse master:twenty &&

		cd ../state_branch_more_paths &&

		git reset --hard v1.0 &&
		git filter-repo --path ten --path twenty \
		                --state-branch state_info \
		                --target ../state_branch_more_paths_export &&

		cd ../state_branch_more_paths_export &&
		echo 3 >expect &&
		git rev-list --count master >actual &&
		test_cmp expect actual &&
		test_must_fail git rev-parse master~2:twenty &&
		git rev-parse master:twenty
	)
'
|
||||
|
||||
test_done
|
||||
|
|
|
@ -30,7 +30,7 @@ def track_everything(obj, *_ignored):
|
|||
def assert_not_reached(x): raise SystemExit("should have been skipped!")
|
||||
obj.dump = assert_not_reached
|
||||
obj.skip()
|
||||
if hasattr(obj, 'id'):
|
||||
if hasattr(obj, 'id') and type(obj) != fr.Tag:
|
||||
# The creation of myblob should cause objects in stream to get their ids
|
||||
# increased by 1; this shouldn't be depended upon as API by external
|
||||
# projects, I'm just verifying an invariant of the current code.
|
||||
|
@ -67,7 +67,7 @@ parser.run(input = sys.stdin.detach(),
|
|||
# DO NOT depend upon or use _IDS directly you external script writers. I'm
|
||||
# only testing here for code coverage; the capacity exists to help debug
|
||||
# git-filter-repo itself, not for external folks to use.
|
||||
assert str(fr._IDS).startswith("Current count: 4")
|
||||
assert str(fr._IDS).startswith("Current count: 5")
|
||||
print("Found {} blobs/commits and {} other objects"
|
||||
.format(total_objects['common'], total_objects['uncommon']))
|
||||
|
||||
|
@ -94,6 +94,9 @@ stream = io.BytesIO(textwrap.dedent('''
|
|||
from :2
|
||||
M 100644 :1 greeting
|
||||
|
||||
reset refs/heads/B
|
||||
from :3
|
||||
|
||||
commit refs/heads/C
|
||||
mark :4
|
||||
author Just Me <just@here.org> 1234567890 -0200
|
||||
|
@ -125,4 +128,4 @@ filter._input = stream
|
|||
filter._setup_output()
|
||||
filter._sanity_checks_handled = True
|
||||
filter.run()
|
||||
assert counts == collections.Counter({fr.Blob: 1, fr.Commit: 3})
|
||||
assert counts == collections.Counter({fr.Blob: 1, fr.Commit: 3, fr.Reset: 1})
|
||||
|
|
Loading…
Reference in New Issue
Block a user