filter-repo: workaround Windows' insistence that cwd not be a bytestring

Unfortunately, it appears that Windows does not allow the 'cwd' argument
of various subprocess calls to be a bytestring.  That may be functional
on Windows since Windows-related filesystems are allowed to require that
all file and directory names be valid unicode, but not all platforms
enforce such restrictions.  As such, I certainly cannot change
   cwd=directory
to
   cwd=decode(directory)
because that could break on other platforms (and perhaps even on Windows
if someone is trying to read a non-native filesystem).  Instead, create
a SubprocessWrapper class that will always call decode on the cwd
argument before passing along to the real subprocess class.  Use these
wrappers on Windows, and do not use them elsewhere.

Signed-off-by: Elijah Newren <newren@gmail.com>
This commit is contained in:
Elijah Newren 2019-10-19 13:50:49 -07:00
parent da2a969157
commit f2729153fe
2 changed files with 78 additions and 55 deletions

View File

@ -36,6 +36,7 @@ import fnmatch
import gettext import gettext
import io import io
import os import os
import platform
import re import re
import shutil import shutil
import subprocess import subprocess
@ -1435,6 +1436,29 @@ _SKIPPED_COMMITS = set()
HASH_TO_ID = {} HASH_TO_ID = {}
ID_TO_HASH = {} ID_TO_HASH = {}
class SubprocessWrapper(object):
@staticmethod
def call(*args, **kwargs):
if 'cwd' in kwargs:
kwargs['cwd'] = decode(kwargs['cwd'])
return subprocess.call(*args, **kwargs)
@staticmethod
def check_output(*args, **kwargs):
if 'cwd' in kwargs:
kwargs['cwd'] = decode(kwargs['cwd'])
return subprocess.check_output(*args, **kwargs)
@staticmethod
def Popen(*args, **kwargs):
if 'cwd' in kwargs:
kwargs['cwd'] = decode(kwargs['cwd'])
return subprocess.Popen(*args, **kwargs)
subproc = subprocess
if platform.system() == 'Windows' or 'PRETEND_UNICODE_FILENAMES' in os.environ:
subproc = SubprocessWrapper
class GitUtils(object): class GitUtils(object):
@staticmethod @staticmethod
def get_commit_count(repo, *args): def get_commit_count(repo, *args):
@ -1445,11 +1469,11 @@ class GitUtils(object):
args = ['--all'] args = ['--all']
if len(args) == 1 and isinstance(args[0], list): if len(args) == 1 and isinstance(args[0], list):
args = args[0] args = args[0]
p1 = subprocess.Popen(["git", "rev-list"] + args, p1 = subproc.Popen(["git", "rev-list"] + args,
bufsize=-1, bufsize=-1,
stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
cwd=repo) cwd=repo)
p2 = subprocess.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE) p2 = subproc.Popen(["wc", "-l"], stdin=p1.stdout, stdout=subprocess.PIPE)
count = int(p2.communicate()[0]) count = int(p2.communicate()[0])
if p1.poll() != 0: if p1.poll() != 0:
raise SystemExit(_("%s does not appear to be a valid git repository") raise SystemExit(_("%s does not appear to be a valid git repository")
@ -1461,7 +1485,7 @@ class GitUtils(object):
""" """
Return the number of objects (both packed and unpacked) Return the number of objects (both packed and unpacked)
""" """
p1 = subprocess.Popen(["git", "count-objects", "-v"], p1 = subproc.Popen(["git", "count-objects", "-v"],
stdout=subprocess.PIPE, cwd=repo) stdout=subprocess.PIPE, cwd=repo)
lines = p1.stdout.read().splitlines() lines = p1.stdout.read().splitlines()
# Return unpacked objects + packed-objects # Return unpacked objects + packed-objects
@ -1469,14 +1493,14 @@ class GitUtils(object):
@staticmethod @staticmethod
def is_repository_bare(repo_working_dir): def is_repository_bare(repo_working_dir):
out = subprocess.check_output('git rev-parse --is-bare-repository'.split(), out = subproc.check_output('git rev-parse --is-bare-repository'.split(),
cwd=repo_working_dir) cwd=repo_working_dir)
return (out.strip() == b'true') return (out.strip() == b'true')
@staticmethod @staticmethod
def determine_git_dir(repo_working_dir): def determine_git_dir(repo_working_dir):
d = subprocess.check_output('git rev-parse --git-dir'.split(), d = subproc.check_output('git rev-parse --git-dir'.split(),
cwd=repo_working_dir).strip() cwd=repo_working_dir).strip()
if repo_working_dir==b'.' or d.startswith(b'/'): if repo_working_dir==b'.' or d.startswith(b'/'):
return d return d
return os.path.join(repo_working_dir, d) return os.path.join(repo_working_dir, d)
@ -1484,8 +1508,8 @@ class GitUtils(object):
@staticmethod @staticmethod
def get_refs(repo_working_dir): def get_refs(repo_working_dir):
try: try:
output = subprocess.check_output('git show-ref'.split(), output = subproc.check_output('git show-ref'.split(),
cwd=repo_working_dir) cwd=repo_working_dir)
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
# If error code is 1, there just aren't any refs; i.e. new repo. # If error code is 1, there just aren't any refs; i.e. new repo.
# If error code is other than 1, some other error (e.g. not a git repo) # If error code is other than 1, some other error (e.g. not a git repo)
@ -1502,9 +1526,9 @@ class GitUtils(object):
# Get sizes of blobs by sha1 # Get sizes of blobs by sha1
cmd = '--batch-check=%(objectname) %(objecttype) ' + \ cmd = '--batch-check=%(objectname) %(objecttype) ' + \
'%(objectsize) %(objectsize:disk)' '%(objectsize) %(objectsize:disk)'
cf = subprocess.Popen(['git', 'cat-file', '--batch-all-objects', cmd], cf = subproc.Popen(['git', 'cat-file', '--batch-all-objects', cmd],
bufsize = -1, bufsize = -1,
stdout = subprocess.PIPE) stdout = subprocess.PIPE)
unpacked_size = {} unpacked_size = {}
packed_size = {} packed_size = {}
for line in cf.stdout: for line in cf.stdout:
@ -1530,7 +1554,7 @@ class GitUtils(object):
file_changes = [] file_changes = []
cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash] cmd = ["git", "diff-tree", "-r", parent_hash, commit_hash]
output = subprocess.check_output(cmd, cwd=repo) output = subproc.check_output(cmd, cwd=repo)
for line in output.splitlines(): for line in output.splitlines():
fileinfo, path = line.split(b'\t', 1) fileinfo, path = line.split(b'\t', 1)
if path.startswith(b'"'): if path.startswith(b'"'):
@ -1556,7 +1580,7 @@ class GitUtils(object):
br'\1@@LOCALEDIR@@"', contents) br'\1@@LOCALEDIR@@"', contents)
cmd = 'git hash-object --stdin'.split() cmd = 'git hash-object --stdin'.split()
version = subprocess.check_output(cmd, input=contents).strip() version = subproc.check_output(cmd, input=contents).strip()
print(decode(version[0:12])) print(decode(version[0:12]))
class FilteringOptions(object): class FilteringOptions(object):
@ -1961,8 +1985,8 @@ EXAMPLES
"incompatible.")) "incompatible."))
# Also throw some sanity checks on git version here; # Also throw some sanity checks on git version here;
# PERF: remove these checks once new enough git versions are common # PERF: remove these checks once new enough git versions are common
p = subprocess.Popen('git fast-export -h'.split(), p = subproc.Popen('git fast-export -h'.split(),
stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
p.wait() p.wait()
output = p.stdout.read() output = p.stdout.read()
if b'--mark-tags' not in output: # pragma: no cover if b'--mark-tags' not in output: # pragma: no cover
@ -1982,8 +2006,8 @@ EXAMPLES
args.preserve_commit_encoding = None args.preserve_commit_encoding = None
# If we don't have fast-exoprt --reencode, we may also be missing # If we don't have fast-exoprt --reencode, we may also be missing
# diff-tree --combined-all-paths, which is even more important... # diff-tree --combined-all-paths, which is even more important...
p = subprocess.Popen('git diff-tree -h'.split(), p = subproc.Popen('git diff-tree -h'.split(),
stdout=subprocess.PIPE, stderr=subprocess.STDOUT) stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
p.wait() p.wait()
output = p.stdout.read() output = p.stdout.read()
if b'--combined-all-paths' not in output: if b'--combined-all-paths' not in output:
@ -2231,7 +2255,7 @@ class RepoAnalyze(object):
cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) + cmd = ('git rev-list --topo-order --reverse {}'.format(' '.join(args.refs)) +
' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' + ' | git diff-tree --stdin --always --root --format=%H%n%P%n%cd' +
' --date=short -M -t -c --raw --combined-all-paths') ' --date=short -M -t -c --raw --combined-all-paths')
dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE) dtp = subproc.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE)
f = dtp.stdout f = dtp.stdout
line = f.readline() line = f.readline()
if not line: if not line:
@ -2775,7 +2799,7 @@ class RepoFilter(object):
"To override, use --force.") % reason) "To override, use --force.") % reason)
# Make sure repo is fully packed, just like a fresh clone would be # Make sure repo is fully packed, just like a fresh clone would be
output = subprocess.check_output('git count-objects -v'.split()) output = subproc.check_output('git count-objects -v'.split())
stats = dict(x.split(b': ') for x in output.splitlines()) stats = dict(x.split(b': ') for x in output.splitlines())
num_packs = int(stats[b'packs']) num_packs = int(stats[b'packs'])
if stats[b'count'] != b'0' or num_packs > 1: if stats[b'count'] != b'0' or num_packs > 1:
@ -2783,7 +2807,7 @@ class RepoFilter(object):
# Make sure there is precisely one remote, named "origin"...or that this # Make sure there is precisely one remote, named "origin"...or that this
# is a new bare repo with no packs and no remotes # is a new bare repo with no packs and no remotes
output = subprocess.check_output('git remote'.split()).strip() output = subproc.check_output('git remote'.split()).strip()
if not (output == b"origin" or (num_packs == 0 and not output)): if not (output == b"origin" or (num_packs == 0 and not output)):
abort(_("expected one remote, origin")) abort(_("expected one remote, origin"))
@ -2813,11 +2837,11 @@ class RepoFilter(object):
# Do extra checks in non-bare repos # Do extra checks in non-bare repos
if not is_bare: if not is_bare:
# Avoid uncommitted, unstaged, or untracked changes # Avoid uncommitted, unstaged, or untracked changes
if subprocess.call('git diff --staged --quiet'.split()): if subproc.call('git diff --staged --quiet'.split()):
abort(_("you have uncommitted changes")) abort(_("you have uncommitted changes"))
if subprocess.call('git diff --quiet'.split()): if subproc.call('git diff --quiet'.split()):
abort(_("you have unstaged changes")) abort(_("you have unstaged changes"))
if len(subprocess.check_output('git ls-files -o'.split())) > 0: if len(subproc.check_output('git ls-files -o'.split())) > 0:
abort(_("you have untracked changes")) abort(_("you have untracked changes"))
# Avoid unpushed changes # Avoid unpushed changes
@ -2833,7 +2857,7 @@ class RepoFilter(object):
decode(origin_ref))) decode(origin_ref)))
# Make sure there is only one worktree # Make sure there is only one worktree
output = subprocess.check_output('git worktree list'.split()) output = subproc.check_output('git worktree list'.split())
if len(output.splitlines()) > 1: if len(output.splitlines()) > 1:
abort(_('you have multiple worktrees')) abort(_('you have multiple worktrees'))
@ -2858,7 +2882,7 @@ class RepoFilter(object):
for cmd in cleanup_cmds: for cmd in cleanup_cmds:
if show_debuginfo: if show_debuginfo:
print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd))) print("[DEBUG] Running{}: {}".format(location_info, ' '.join(cmd)))
subprocess.call(cmd, cwd=repo) subproc.call(cmd, cwd=repo)
def _get_rename(self, old_hash): def _get_rename(self, old_hash):
# If we already know the rename, just return it # If we already know the rename, just return it
@ -3377,11 +3401,11 @@ class RepoFilter(object):
working_dir = self._args.target or b'.' working_dir = self._args.target or b'.'
cmd = ['git', '-C', working_dir, 'show-ref', full_branch] cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
contents = b'' contents = b''
if subprocess.call(cmd, stdout=subprocess.DEVNULL) == 0: if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
cmd = ['git', '-C', working_dir, 'show', cmd = ['git', '-C', working_dir, 'show',
'%s:%s' % (full_branch, decode(marks_basename))] '%s:%s' % (full_branch, decode(marks_basename))]
try: try:
contents = subprocess.check_output(cmd) contents = subproc.check_output(cmd)
except subprocess.CalledProcessError as e: # pragma: no cover except subprocess.CalledProcessError as e: # pragma: no cover
raise SystemExit(_("Failed loading %s from %s") % raise SystemExit(_("Failed loading %s from %s") %
(decode(marks_basename), branch)) (decode(marks_basename), branch))
@ -3400,7 +3424,7 @@ class RepoFilter(object):
parent = [] parent = []
full_branch = 'refs/heads/{}'.format(self._args.state_branch) full_branch = 'refs/heads/{}'.format(self._args.state_branch)
cmd = ['git', '-C', working_dir, 'show-ref', full_branch] cmd = ['git', '-C', working_dir, 'show-ref', full_branch]
if subprocess.call(cmd, stdout=subprocess.DEVNULL) == 0: if subproc.call(cmd, stdout=subprocess.DEVNULL) == 0:
parent = ['-p', full_branch] parent = ['-p', full_branch]
# Run 'git hash-object $MARKS_FILE' for each marks file, save result # Run 'git hash-object $MARKS_FILE' for each marks file, save result
@ -3411,11 +3435,11 @@ class RepoFilter(object):
raise SystemExit(_("Failed to find %s to save to %s") raise SystemExit(_("Failed to find %s to save to %s")
% (marks_file, self._args.state_branch)) % (marks_file, self._args.state_branch))
cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file] cmd = ['git', '-C', working_dir, 'hash-object', '-w', marks_file]
blob_hashes[marks_basename] = subprocess.check_output(cmd).strip() blob_hashes[marks_basename] = subproc.check_output(cmd).strip()
# Run 'git mktree' to create a tree out of it # Run 'git mktree' to create a tree out of it
p = subprocess.Popen(['git', '-C', working_dir, 'mktree'], p = subproc.Popen(['git', '-C', working_dir, 'mktree'],
stdin=subprocess.PIPE, stdout=subprocess.PIPE) stdin=subprocess.PIPE, stdout=subprocess.PIPE)
for b in basenames: for b in basenames:
p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b)) p.stdin.write(b'100644 blob %s\t%s\n' % (blob_hashes[b], b))
p.stdin.close() p.stdin.close()
@ -3425,9 +3449,8 @@ class RepoFilter(object):
# Create the new commit # Create the new commit
cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files', cmd = (['git', '-C', working_dir, 'commit-tree', '-m', 'New mark files',
tree] + parent) tree] + parent)
commit = subprocess.check_output(cmd).strip() commit = subproc.check_output(cmd).strip()
subprocess.call(['git', '-C', working_dir, 'update-ref', subproc.call(['git', '-C', working_dir, 'update-ref', full_branch, commit])
full_branch, commit])
def importer_only(self): def importer_only(self):
self._run_sanity_checks() self._run_sanity_checks()
@ -3479,7 +3502,7 @@ class RepoFilter(object):
'--signed-tags=strip', '--tag-of-filtered-object=rewrite', '--signed-tags=strip', '--tag-of-filtered-object=rewrite',
'--fake-missing-tagger', '--reference-excluded-parents' '--fake-missing-tagger', '--reference-excluded-parents'
] + extra_flags + self._args.refs ] + extra_flags + self._args.refs
self._fep = subprocess.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE) self._fep = subproc.Popen(fep_cmd, bufsize=-1, stdout=subprocess.PIPE)
self._input = self._fep.stdout self._input = self._fep.stdout
if self._args.dry_run or self._args.debug: if self._args.dry_run or self._args.debug:
self._fe_orig = os.path.join(self.results_tmp_dir(), self._fe_orig = os.path.join(self.results_tmp_dir(),
@ -3500,10 +3523,8 @@ class RepoFilter(object):
target_marks_file = self._load_marks_file(b'target-marks') target_marks_file = self._load_marks_file(b'target-marks')
fip_cmd.extend([b'--export-marks='+target_marks_file, fip_cmd.extend([b'--export-marks='+target_marks_file,
b'--import-marks='+target_marks_file]) b'--import-marks='+target_marks_file])
self._fip = subprocess.Popen(fip_cmd, self._fip = subproc.Popen(fip_cmd, bufsize=-1,
bufsize=-1, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
stdin=subprocess.PIPE,
stdout=subprocess.PIPE)
self._import_pipes = (self._fip.stdin, self._fip.stdout) self._import_pipes = (self._fip.stdin, self._fip.stdout)
if self._args.dry_run or self._args.debug: if self._args.dry_run or self._args.debug:
self._fe_filt = os.path.join(self.results_tmp_dir(), self._fe_filt = os.path.join(self.results_tmp_dir(),
@ -3526,9 +3547,8 @@ class RepoFilter(object):
if self._args.debug: if self._args.debug:
print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*") print("[DEBUG] Migrating refs/remotes/origin/* -> refs/heads/*")
target_working_dir = self._args.target or b'.' target_working_dir = self._args.target or b'.'
p = subprocess.Popen('git update-ref --no-deref --stdin'.split(), p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
stdin=subprocess.PIPE, stdin=subprocess.PIPE, cwd=target_working_dir)
cwd=target_working_dir)
for ref in refs_to_migrate: for ref in refs_to_migrate:
if ref == b'refs/remotes/origin/HEAD': if ref == b'refs/remotes/origin/HEAD':
p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref])) p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref]))
@ -3548,7 +3568,7 @@ class RepoFilter(object):
if self._args.debug: if self._args.debug:
print("[DEBUG] Removing 'origin' remote (rewritten history will no ") print("[DEBUG] Removing 'origin' remote (rewritten history will no ")
print(" longer be related; consider re-pushing it elsewhere.") print(" longer be related; consider re-pushing it elsewhere.")
subprocess.call('git remote rm origin'.split(), cwd=target_working_dir) subproc.call('git remote rm origin'.split(), cwd=target_working_dir)
def _final_commands(self): def _final_commands(self):
self._finalize_handled = True self._finalize_handled = True
@ -3559,9 +3579,9 @@ class RepoFilter(object):
def _ref_update(self, target_working_dir): def _ref_update(self, target_working_dir):
# Start the update-ref process # Start the update-ref process
p = subprocess.Popen('git update-ref --no-deref --stdin'.split(), p = subproc.Popen('git update-ref --no-deref --stdin'.split(),
stdin=subprocess.PIPE, stdin=subprocess.PIPE,
cwd=target_working_dir) cwd=target_working_dir)
# Remove replace_refs from _orig_refs # Remove replace_refs from _orig_refs
replace_refs = {k:v for k, v in self._orig_refs.items() replace_refs = {k:v for k, v in self._orig_refs.items()
@ -3636,10 +3656,10 @@ class RepoFilter(object):
if not batch_check_process: if not batch_check_process:
cmd = 'git cat-file --batch-check'.split() cmd = 'git cat-file --batch-check'.split()
target_working_dir = self._args.target or b'.' target_working_dir = self._args.target or b'.'
batch_check_process = subprocess.Popen(cmd, batch_check_process = subproc.Popen(cmd,
stdin=subprocess.PIPE, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
cwd=target_working_dir) cwd=target_working_dir)
batch_check_process.stdin.write(refname+b"\n") batch_check_process.stdin.write(refname+b"\n")
batch_check_process.stdin.flush() batch_check_process.stdin.flush()
line = batch_check_process.stdout.readline() line = batch_check_process.stdout.readline()

View File

@ -16,7 +16,10 @@ EOF
export COVERAGE_PROCESS_START=$tmpdir/.coveragerc export COVERAGE_PROCESS_START=$tmpdir/.coveragerc
export PYTHONPATH=$tmpdir: export PYTHONPATH=$tmpdir:
# We pretend filenames are unicode for two reasons: (1) because it exercises
# more code, and (2) this setting will detect accidental use of unicode strings
# for file/directory names when it should always be bytestrings.
export PRETEND_UNICODE_FILENAMES=1
ls t939*.sh | xargs -n 1 bash ls t939*.sh | xargs -n 1 bash