From f2729153feee13fbb42cd797937cc8fb36665674 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 19 Oct 2019 13:50:49 -0700 Subject: [PATCH] filter-repo: workaround Windows' insistence that cwd not be a bytestring Unfortunately, it appears that Windows does not allow the 'cwd' argument of various subprocess calls to be a bytestring. That may be functional on Windows since Windows-related filesystems are allowed to require that all file and directory names be valid unicode, but not all platforms enforce such restrictions. As such, I certainly cannot change cwd=directory to cwd=decode(directory) because that could break on other platforms (and perhaps even on Windows if someone is trying to read a non-native filesystem). Instead, create a SubprocessWrapper class that will always call decode on the cwd argument before passing along to the real subprocess class. Use these wrappers on Windows, and do not use them elsewhere. Signed-off-by: Elijah Newren --- git-filter-repo | 128 ++++++++++++++++++++++++++++-------------------- t/run_coverage | 5 +- 2 files changed, 78 insertions(+), 55 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 4aa4bc2..eb1146c 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -36,6 +36,7 @@ import fnmatch import gettext import io import os +import platform import re import shutil import subprocess @@ -1435,6 +1436,29 @@ _SKIPPED_COMMITS = set() HASH_TO_ID = {} ID_TO_HASH = {} +class SubprocessWrapper(object): + @staticmethod + def call(*args, **kwargs): + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.call(*args, **kwargs) + + @staticmethod + def check_output(*args, **kwargs): + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.check_output(*args, **kwargs) + + @staticmethod + def Popen(*args, **kwargs): + if 'cwd' in kwargs: + kwargs['cwd'] = decode(kwargs['cwd']) + return subprocess.Popen(*args, **kwargs) + +subproc = subprocess +if platform.system() == 'Windows' or 'PRETEND_UNICODE_FILENAMES' in os.environ: + subproc = SubprocessWrapper + class GitUtils(object): @staticmethod def get_commit_count(repo, *args): @@ -1445,11 +1469,11 @@ class GitUtils(object): args = ['--all'] if len(args) == 1 and isinstance(args[0], list): args = args[0] - p1 = subprocess.Popen(["git", "rev-list"] + args, - bufsize=-1, - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=repo) - p2 = subprocess.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE) + p1 = subproc.Popen(["git", "rev-list"] + args, + bufsize=-1, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + cwd=repo) + p2 = subproc.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE) count = int(p2.communicate()[0]) if p1.poll() != 0: raise SystemExit(_("%s does not appear to be a valid git repository") @@ -1461,7 +1485,7 @@ class GitUtils(object): """ Return the number of objects (both packed and unpacked) """ - p1 = subprocess.Popen(["git", "count-objects", "-v"], + p1 = subproc.Popen(["git", "count-objects", "-v"], stdout=subprocess.PIPE, cwd=repo) lines = p1.stdout.read().splitlines() # Return unpacked objects + packed-objects @@ -1469,14 +1493,14 @@ class GitUtils(object): @staticmethod def is_repository_bare(repo_working_dir): - out = subprocess.check_output('git rev-parse --is-bare-repository'.split(), - cwd=repo_working_dir) + out = subproc.check_output('git rev-parse --is-bare-repository'.split(), + cwd=repo_working_dir) return (out.strip() == b'true') @staticmethod def determine_git_dir(repo_working_dir): - d = subprocess.check_output('git rev-parse --git-dir'.split(), - cwd=repo_working_dir).strip() + d = subproc.check_output('git rev-parse --git-dir'.split(), + cwd=repo_working_dir).strip() if repo_working_dir==b'.' or d.startswith(b'/'): return d return os.path.join(repo_working_dir, d) @@ -1484,8 +1508,8 @@ class GitUtils(object): @staticmethod def get_refs(repo_working_dir): try: - output = subprocess.check_output('git show-ref'.split(), - cwd=repo_working_dir) + output = subproc.check_output('git show-ref'.split(), + cwd=repo_working_dir) except subprocess.CalledProcessError as e: # If error code is 1, there just aren't any refs; i.e. new repo. # If error code is other than 1, some other error (e.g. not a git repo) @@ -1502,9 +1526,9 @@ class GitUtils(object): # Get sizes of blobs by sha1 cmd = '--batch-check=%(objectname) %(objecttype) ' + \ '%(objectsize) %(objectsize:disk)' - cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', cmd], - bufsize = -1, - stdout = subprocess.PIPE) + cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd], + bufsize = -1, + stdout = subprocess.PIPE) unpacked_size = {} packed_size = {} for line in cf.stdout: @@ -1530,7 +1554,7 @@ class GitUtils(object): file_changes = [] cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash] - output = subprocess.check_output(cmd, cwd=repo) + output = subproc.check_output(cmd, cwd=repo) for line in output.splitlines(): fileinfo, path = line.split(b'\t', 1) if path.startswith(b'"'): @@ -1556,7 +1580,7 @@ class GitUtils(object): br'\1@@LOCALEDIR@@"', contents) cmd = 'git hash-object --stdin'.split() - version = subprocess.check_output(cmd, input=contents).strip() + version = subproc.check_output(cmd, input=contents).strip() print(decode(version[0:12])) class FilteringOptions(object): @@ -1961,8 +1985,8 @@ EXAMPLES "incompatible.")) # Also throw some sanity checks on git version here; # PERF: remove these checks once new enough git versions are common - p = subprocess.Popen('git fast-export -h'.split(), - stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + p = subproc.Popen('git fast-export -h'.split(), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) p.wait() output = p.stdout.read() if b'--mark-tags' not in output: # pragma: no cover @@ -1982,8 +2006,8 @@ EXAMPLES args.preserve_commit_encoding = None # If we don't have fast-exoprt --reencode, we may also be missing # diff-tree --combined-all-paths, which is even more important... - p = subprocess.Popen('git diff-tree -h'.split(), - stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + p = subproc.Popen('git diff-tree -h'.split(), + stdout=subprocess.PIPE, stderr=subprocess.STDOUT) p.wait() output = p.stdout.read() if b'--combined-all-paths' not in output: @@ -2231,7 +2255,7 @@ class RepoAnalyze(object): cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) + ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' + ' --date=short -M -t -c --raw --combined-all-paths') - dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE) + dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE) f = dtp.stdout line = f.readline() if not line: @@ -2775,7 +2799,7 @@ class RepoFilter(object): "To override, use --force.") % reason) # Make sure repo is fully packed, just like a fresh clone would be - output = subprocess.check_output('git count-objects -v'.split()) + output = subproc.check_output('git count-objects -v'.split()) stats = dict(x.split(b': ') for x in output.splitlines()) num_packs = int(stats[b'packs']) if stats[b'count'] != b'0' or num_packs > 1: @@ -2783,7 +2807,7 @@ class RepoFilter(object): # Make sure there is precisely one remote, named "origin"...or that this # is a new bare repo with no packs and no remotes - output = subprocess.check_output('git remote'.split()).strip() + output = subproc.check_output('git remote'.split()).strip() if not (output == b"origin" or (num_packs == 0 and not output)): abort(_("expected one remote, origin")) @@ -2813,11 +2837,11 @@ class RepoFilter(object): # Do extra checks in non-bare repos if not is_bare: # Avoid uncommitted, unstaged, or untracked changes - if subprocess.call('git diff --staged --quiet'.split()): + if subproc.call('git diff --staged --quiet'.split()): abort(_("you have uncommitted changes")) - if subprocess.call('git diff --quiet'.split()): + if subproc.call('git diff --quiet'.split()): abort(_("you have unstaged changes")) - if len(subprocess.check_output('git ls-files -o'.split())) > 0: + if len(subproc.check_output('git ls-files -o'.split())) > 0: abort(_("you have untracked changes")) # Avoid unpushed changes @@ -2833,7 +2857,7 @@ class RepoFilter(object): decode(origin_ref))) # Make sure there is only one worktree - output = subprocess.check_output('git worktree list'.split()) + output = subproc.check_output('git worktree list'.split()) if len(output.splitlines()) > 1: abort(_('you have multiple worktrees')) @@ -2858,7 +2882,7 @@ class RepoFilter(object): for cmd in cleanup_cmds: if show_debuginfo: print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd))) - subprocess.call(cmd, cwd=repo) + subproc.call(cmd, cwd=repo) def _get_rename(self, old_hash): # If we already know the rename, just return it @@ -3377,11 +3401,11 @@ class RepoFilter(object): working_dir = self._args.target or b'.' cmd = ['git', '-C', working_dir, 'show-ref', full_branch] contents = b'' - if subprocess.call(cmd, stdout=subprocess.DEVNULL) == 0: + if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0: cmd = ['git', '-C', working_dir, 'show', '%s:%s' % (full_branch, decode(marks_basename))] try: - contents = subprocess.check_output(cmd) + contents = subproc.check_output(cmd) except subprocess.CalledProcessError as e: # pragma: no cover raise SystemExit(_("Failed loading %s from %s") % (decode(marks_basename), branch)) @@ -3400,7 +3424,7 @@ class RepoFilter(object): parent = [] full_branch = 'refs/heads/{}'.format(self._args.state_branch) cmd = ['git', '-C', working_dir, 'show-ref', full_branch] - if subprocess.call(cmd, stdout=subprocess.DEVNULL) == 0: + if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0: parent = ['-p', full_branch] # Run 'git hash-object $MARKS_FILE' for each marks file, save result @@ -3411,11 +3435,11 @@ class RepoFilter(object): raise SystemExit(_("Failed to find %s to save to %s") % (marks_file, self._args.state_branch)) cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file] - blob_hashes[marks_basename] = subprocess.check_output(cmd).strip() + blob_hashes[marks_basename] = subproc.check_output(cmd).strip() # Run 'git mktree' to create a tree out of it - p = subprocess.Popen(['git', '-C', working_dir, 'mktree'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE) + p = subproc.Popen(['git', '-C', working_dir, 'mktree'], + stdin=subprocess.PIPE, stdout=subprocess.PIPE) for b in basenames: p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b)) p.stdin.close() @@ -3425,9 +3449,8 @@ class RepoFilter(object): # Create the new commit cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files', tree] + parent) - commit = subprocess.check_output(cmd).strip() - subprocess.call(['git', '-C', working_dir, 'update-ref', - full_branch, commit]) + commit = subproc.check_output(cmd).strip() + subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit]) def importer_only(self): self._run_sanity_checks() @@ -3479,7 +3502,7 @@ class RepoFilter(object): '--signed-tags=strip', '--tag-of-filtered-object=rewrite', '--fake-missing-tagger', '--reference-excluded-parents' ] + extra_flags + self._args.refs - self._fep = subprocess.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE) + self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE) self._input = self._fep.stdout if self._args.dry_run or self._args.debug: self._fe_orig = os.path.join(self.results_tmp_dir(), @@ -3500,10 +3523,8 @@ class RepoFilter(object): target_marks_file = self._load_marks_file(b'target-marks') fip_cmd.extend([b'--export-marks='+target_marks_file, b'--import-marks='+target_marks_file]) - self._fip = subprocess.Popen(fip_cmd, - bufsize=-1, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE) + self._fip = subproc.Popen(fip_cmd, bufsize=-1, + stdin=subprocess.PIPE, stdout=subprocess.PIPE) self._import_pipes = (self._fip.stdin, self._fip.stdout) if self._args.dry_run or self._args.debug: self._fe_filt = os.path.join(self.results_tmp_dir(), @@ -3526,9 +3547,8 @@ class RepoFilter(object): if self._args.debug: print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*") target_working_dir = self._args.target or b'.' - p = subprocess.Popen('git update-ref --no-deref --stdin'.split(), - stdin=subprocess.PIPE, - cwd=target_working_dir) + p = subproc.Popen('git update-ref --no-deref --stdin'.split(), + stdin=subprocess.PIPE, cwd=target_working_dir) for ref in refs_to_migrate: if ref == b'refs/remotes/origin/HEAD': p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref])) @@ -3548,7 +3568,7 @@ class RepoFilter(object): if self._args.debug: print("[DEBUG] Removing 'origin' remote (rewritten history will no ") print(" longer be related; consider re-pushing it elsewhere.") - subprocess.call('git remote rm origin'.split(), cwd=target_working_dir) + subproc.call('git remote rm origin'.split(), cwd=target_working_dir) def _final_commands(self): self._finalize_handled = True @@ -3559,9 +3579,9 @@ class RepoFilter(object): def _ref_update(self, target_working_dir): # Start the update-ref process - p = subprocess.Popen('git update-ref --no-deref --stdin'.split(), - stdin=subprocess.PIPE, - cwd=target_working_dir) + p = subproc.Popen('git update-ref --no-deref --stdin'.split(), + stdin=subprocess.PIPE, + cwd=target_working_dir) # Remove replace_refs from _orig_refs replace_refs = {k:v for k, v in self._orig_refs.items() @@ -3636,10 +3656,10 @@ class RepoFilter(object): if not batch_check_process: cmd = 'git cat-file --batch-check'.split() target_working_dir = self._args.target or b'.' - batch_check_process = subprocess.Popen(cmd, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - cwd=target_working_dir) + batch_check_process = subproc.Popen(cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + cwd=target_working_dir) batch_check_process.stdin.write(refname+b"\n") batch_check_process.stdin.flush() line = batch_check_process.stdout.readline() diff --git a/t/run_coverage b/t/run_coverage index 3abd9af..8d7e06f 100755 --- a/t/run_coverage +++ b/t/run_coverage @@ -16,7 +16,10 @@ EOF export COVERAGE_PROCESS_START=$tmpdir/.coveragerc export PYTHONPATH=$tmpdir: - +# We pretend filenames are unicode for two reasons: (1) because it exercises +# more code, and (2) this setting will detect accidental use of unicode strings +# for file/directory names when it should always be bytestrings. +export PRETEND_UNICODE_FILENAMES=1 ls t939*.sh | xargs -n 1 bash