From e5955f397f53f5f12ec0881096287eea27ee8573 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Mon, 1 Apr 2019 14:35:49 -0700 Subject: [PATCH 01/17] filter-repo (python3): shebang and imports Signed-off-by: Elijah Newren --- git-filter-repo | 8 +++----- t/t9391-filter-repo-lib-usage.sh | 2 +- t/t9391/commit_info.py | 2 +- t/t9391/create_fast_export_output.py | 2 +- t/t9391/erroneous.py | 2 +- t/t9391/file_filter.py | 2 +- t/t9391/print_progress.py | 6 +++--- t/t9391/rename-master-to-develop.py | 2 +- t/t9391/splice_repos.py | 2 +- t/t9391/strip-cvs-keywords.py | 2 +- t/t9391/unusual.py | 6 +++--- 11 files changed, 17 insertions(+), 19 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 07c769c..6fb102a 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ git-filter-repo filters git repositories, similar to git filter-branch, BFG @@ -30,8 +30,6 @@ operations; however: ***** END API BACKWARD COMPATIBILITY CAVEAT ***** """ -from __future__ import print_function - import argparse import collections import fnmatch @@ -39,7 +37,7 @@ import gettext import os import re import shutil -import StringIO +import io import subprocess import sys import time @@ -408,7 +406,7 @@ class _GitElement(object): Convert GitElement to string; used for debugging """ old_dumped = self.dumped - writeme = StringIO.StringIO() + writeme = io.StringIO() self.dump(writeme) output_lines = writeme.getvalue().splitlines() writeme.close() diff --git a/t/t9391-filter-repo-lib-usage.sh b/t/t9391-filter-repo-lib-usage.sh index e923d29..a967f31 100755 --- a/t/t9391-filter-repo-lib-usage.sh +++ b/t/t9391-filter-repo-lib-usage.sh @@ -158,7 +158,7 @@ test_expect_success 'other error cases' ' mkdir other && cd other && - ! python -c "import git_filter_repo as fr; fr.GitUtils.get_commit_count(\".\", [\"HEAD\"])" 2>err && + ! python3 -c "import git_filter_repo as fr; fr.GitUtils.get_commit_count(\".\", [\"HEAD\"])" 2>err && test_i18ngrep ". does not appear to be a valid git repository" err ) ' diff --git a/t/t9391/commit_info.py b/t/t9391/commit_info.py index e697bd8..a0d34f3 100755 --- a/t/t9391/commit_info.py +++ b/t/t9391/commit_info.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Please see the diff --git a/t/t9391/create_fast_export_output.py b/t/t9391/create_fast_export_output.py index a1b21e0..e2ef13c 100755 --- a/t/t9391/create_fast_export_output.py +++ b/t/t9391/create_fast_export_output.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Please see the diff --git a/t/t9391/erroneous.py b/t/t9391/erroneous.py index a5c05d2..db6051b 100755 --- a/t/t9391/erroneous.py +++ b/t/t9391/erroneous.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Please see the diff --git a/t/t9391/file_filter.py b/t/t9391/file_filter.py index f6a1ae9..8540b7d 100755 --- a/t/t9391/file_filter.py +++ b/t/t9391/file_filter.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Please see the diff --git a/t/t9391/print_progress.py b/t/t9391/print_progress.py index 5256b74..bbca538 100755 --- a/t/t9391/print_progress.py +++ b/t/t9391/print_progress.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Please see the @@ -21,8 +21,8 @@ commit_count = 0 def print_progress(): global object_count, commit_count, total_objects, total_commits - print "\rRewriting commits... %d/%d (%d objects)" \ - % (commit_count, total_commits, object_count), + print("\rRewriting commits... 
%d/%d (%d objects)" + % (commit_count, total_commits, object_count), end='') def my_blob_callback(blob): global object_count diff --git a/t/t9391/rename-master-to-develop.py b/t/t9391/rename-master-to-develop.py index f92517a..7a922d0 100755 --- a/t/t9391/rename-master-to-develop.py +++ b/t/t9391/rename-master-to-develop.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Please see the diff --git a/t/t9391/splice_repos.py b/t/t9391/splice_repos.py index 00d0058..c7834c7 100755 --- a/t/t9391/splice_repos.py +++ b/t/t9391/splice_repos.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Please see the diff --git a/t/t9391/strip-cvs-keywords.py b/t/t9391/strip-cvs-keywords.py index 1067d55..ccd3c8d 100755 --- a/t/t9391/strip-cvs-keywords.py +++ b/t/t9391/strip-cvs-keywords.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Please see the diff --git a/t/t9391/unusual.py b/t/t9391/unusual.py index 3167c0f..6a61dbe 100755 --- a/t/t9391/unusual.py +++ b/t/t9391/unusual.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Please: DO NOT USE THIS AS AN EXAMPLE. # @@ -14,7 +14,7 @@ import collections import os import random -import StringIO +import io import sys import textwrap @@ -71,7 +71,7 @@ print("Found {} blobs/commits and {} other objects" .format(total_objects['common'], total_objects['uncommon'])) -stream = StringIO.StringIO(textwrap.dedent(''' +stream = io.StringIO(textwrap.dedent(''' blob mark :1 data 5 From 511a8f52f879846ddb7cbe09b7030ed83da3479d Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Mon, 1 Apr 2019 14:36:57 -0700 Subject: [PATCH 02/17] filter-repo (python3): iteritems() -> items() Signed-off-by: Elijah Newren --- git-filter-repo | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 6fb102a..09758a1 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -268,7 +268,7 @@ class MailmapInfo(object): ''' Given a name and email, return the expected new name and email from the mailmap if there is a translation rule for it, otherwise just return the given name and email.''' - for old, new in self.changes.iteritems(): + for old, new in self.changes.items(): old_name, old_email = old new_name, new_email = new if (email == old_email or not old_email) and ( @@ -1585,7 +1585,7 @@ class FastExportFilter(object): def _handle_final_commands(self): self._finalize_handled = True - for ref, value in self._seen_refs.iteritems(): + for ref, value in self._seen_refs.items(): if value is not None: # Create a reset reset = Reset(ref, value) @@ -1604,13 +1604,13 @@ class FastExportFilter(object): self._flush_renames() with open(os.path.join(metadata_dir, 'commit-map'), 'w') as f: f.write("%-40s %s\n" % (_("old"), _("new"))) - for (old,new) in self._commit_renames.iteritems(): + for (old,new) in self._commit_renames.items(): f.write('{} {}\n'.format(old, new if new != None else deleted_hash)) batch_check_process = None batch_check_output_re = re.compile('^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$') with open(os.path.join(metadata_dir, 'ref-map'), 'w') as f: - for refname, old_hash in orig_refs.iteritems(): + for refname, old_hash in orig_refs.items(): if refname in refs_nuked: new_hash = deleted_hash elif old_hash in self._commit_renames: @@ -2412,7 +2412,7 @@ class RepoAnalyze(object): # too. 
with open(os.path.join(reportdir, "renames.txt"), 'w') as f: seen = set() - for pathname,equiv_group in sorted(stats['equivalence'].iteritems(), + for pathname,equiv_group in sorted(stats['equivalence'].items(), key=lambda x:(x[1], x[0])): if equiv_group in seen: continue @@ -2425,7 +2425,7 @@ class RepoAnalyze(object): with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f: f.write("=== %s ===\n" % _("Deleted directories by reverse size")) f.write(_("Format: unpacked size, packed size, date deleted, directory name\n")) - for dirname, size in sorted(dir_size['packed'].iteritems(), + for dirname, size in sorted(dir_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): if (dir_deleted_data[dirname]): f.write(" {:10d} {:10d} {:10s} {}\n" @@ -2437,7 +2437,7 @@ class RepoAnalyze(object): with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f: f.write("=== %s ===\n" % _("All directories by reverse size")) f.write(_("Format: unpacked size, packed size, date deleted, directory name\n")) - for dirname, size in sorted(dir_size['packed'].iteritems(), + for dirname, size in sorted(dir_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): f.write(" {:10d} {:10d} {:10s} {}\n" .format(dir_size['unpacked'][dirname], @@ -2449,7 +2449,7 @@ class RepoAnalyze(object): with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f: f.write("=== %s ===\n" % _("Deleted extensions by reverse size")) f.write(_("Format: unpacked size, packed size, date deleted, extension name\n")) - for extname, size in sorted(ext_size['packed'].iteritems(), + for extname, size in sorted(ext_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): if (ext_deleted_data[extname]): f.write(" {:10d} {:10d} {:10s} {}\n" @@ -2461,7 +2461,7 @@ class RepoAnalyze(object): with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f: f.write("=== %s ===\n" % _("All extensions by reverse size")) f.write(_("Format: unpacked size, packed size, date deleted, extension name\n")) - for extname, size in sorted(ext_size['packed'].iteritems(), + for extname, size in sorted(ext_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): f.write(" {:10d} {:10d} {:10s} {}\n" .format(ext_size['unpacked'][extname], @@ -2473,7 +2473,7 @@ class RepoAnalyze(object): with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f: f.write("=== %s ===\n" % _("Deleted paths by reverse accumulated size")) f.write(_("Format: unpacked size, packed size, date deleted, path name(s)\n")) - for pathname, size in sorted(path_size['packed'].iteritems(), + for pathname, size in sorted(path_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): when = stats['file_deletions'].get(pathname, None) if when: @@ -2486,7 +2486,7 @@ class RepoAnalyze(object): with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f: f.write("=== %s ===\n" % _("All paths by reverse accumulated size")) f.write(_("Format: unpacked size, packed size, date deleted, pathectory name\n")) - for pathname, size in sorted(path_size['packed'].iteritems(), + for pathname, size in sorted(path_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): when = stats['file_deletions'].get(pathname, None) f.write(" {:10d} {:10d} {:10s} {}\n" @@ -2499,7 +2499,7 @@ class RepoAnalyze(object): with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f: f.write("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")) f.write(_("Format: sha, unpacked 
size, packed size, filename(s) object stored as\n")) - for sha, size in sorted(stats['packed_size'].iteritems(), + for sha, size in sorted(stats['packed_size'].items(), key=lambda x:(x[1],x[0]), reverse=True): if sha not in stats['names']: # Some objects in the repository might not be referenced, or not @@ -2723,7 +2723,7 @@ class RepoFilter(object): abort(_("you have untracked changes")) # Avoid unpushed changes - for refname, rev in refs.iteritems(): + for refname, rev in refs.items(): if not refname.startswith('refs/heads/'): continue origin_ref = refname.replace('refs/heads/', 'refs/remotes/origin/') From 468ef568cf863e03bd9c1b28267eb96d2378d926 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Mon, 1 Apr 2019 14:38:02 -0700 Subject: [PATCH 03/17] filter-repo (python3): xrange() -> range() Signed-off-by: Elijah Newren --- git-filter-repo | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 09758a1..197664f 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -126,7 +126,7 @@ class PathQuoting: '"': '"', '\\':'\\'} _unescape_re = re.compile(r'\\([a-z"\\]|[0-9]{3})') - _escape = [chr(x) for x in xrange(127)]+['\\'+oct(x)[1:] for x in xrange(127,256)] + _escape = [chr(x) for x in range(127)]+['\\'+oct(x)[1:] for x in range(127,256)] _reverse = dict(map(reversed, _unescape.items())) for x in _reverse: _escape[ord(x)] = '\\'+_reverse[x] @@ -1217,10 +1217,10 @@ class FastExportFilter(object): # ancestor of another parent.) num_parents = len(parents) to_remove = [] - for cur in xrange(num_parents): + for cur in range(num_parents): if not is_rewritten[cur]: continue - for other in xrange(num_parents): + for other in range(num_parents): if cur == other: continue if not self._graph.is_ancestor(parents[cur], parents[other]): From 2562f0270c41599eeed3a1fe26f03107596b72af Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 30 Apr 2019 23:46:05 -0700 Subject: [PATCH 04/17] filter-repo (python3): revert "workaround python<2.7.9 exec bug" Commit ca32c5d9afe2 ("filter-repo: workaround python<2.7.9 exec bug", 2019-04-30) put in a workaround for python versions prior to 2.7.9, but which was incompatible with python3. Revert it as one step towards migrating to python3. Signed-off-by: Elijah Newren --- git-filter-repo | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 197664f..8039fdd 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -2619,8 +2619,8 @@ class RepoFilter(object): def _handle_arg_callbacks(self): def make_callback(argname, str): - exec 'def callback({}):\n'.format(argname)+\ - ' '+'\n '.join(str.splitlines()) in globals() + exec('def callback({}):\n'.format(argname)+ + ' '+'\n '.join(str.splitlines()), globals()) return callback #namespace['callback'] def handle(type): callback_field = '_{}_callback'.format(type) From 1a8e247ba72afdf79ff28bbf94693ed379cca2e5 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 26 Apr 2019 17:53:00 -0700 Subject: [PATCH 05/17] filter-repo (python3): add a decode() function We need a function to transform byte strings into unicode strings for printing error messages and occasional other uses. 
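For example (an illustration only, not part of the patch), the 'backslashreplace' error handler used by decode() keeps undecodable bytes visible instead of raising UnicodeDecodeError; note this assumes python 3.5 or later, where 'backslashreplace' applies to decoding and not just encoding:

    >>> b'caf\xc3\xa9'.decode('utf-8', 'backslashreplace')
    'café'
    >>> b'caf\xe9'.decode('utf-8', 'backslashreplace')  # invalid UTF-8 byte
    'caf\\xe9'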
Signed-off-by: Elijah Newren --- git-filter-repo | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/git-filter-repo b/git-filter-repo index 8039fdd..5420acd 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -115,6 +115,10 @@ def date_to_string(dateobj): return('{} {}'.format(int(_timedelta_to_seconds(dateobj - epoch)), dateobj.tzinfo.tzname(0))) +def decode(bytestr): + 'Try to convert bytestr to utf-8 for outputting as an error message.' + return bytestr.decode('utf-8', 'backslashreplace') + class PathQuoting: _unescape = {'a': '\a', 'b': '\b', From ad3c839263be4ff7798ed8ae7cf5cf7f89806138 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 26 Apr 2019 17:59:50 -0700 Subject: [PATCH 06/17] filter-repo (python3): handle conversion of glob to regex python3 forces a couple issues for us with the conversion of globs to regexes: * fnmatch.translate() will ONLY operate on unicode strings, not bytestrings. Super lame. * newer versions of python3 modified the regex style used by fnmatch.translate() causing us to need extra logic to 'fixup' the regex into the form we want. Split the code for translating the glob to a regex out into a separate function which now houses more complicated logic to handle these extra conditions. Signed-off-by: Elijah Newren --- git-filter-repo | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 5420acd..399a921 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -119,6 +119,33 @@ def decode(bytestr): 'Try to convert bytestr to utf-8 for outputting as an error message.' return bytestr.decode('utf-8', 'backslashreplace') +def glob_to_regex(glob_bytestr): + 'Translate glob_bytestr into a regex on bytestrings' + + # fnmatch.translate is idiotic and won't accept bytestrings + if (decode(glob_bytestr).encode() != glob_bytestr): # pragma: no cover + raise SystemExit(_("Error: Cannot handle glob %s").format(glob_bytestr)) + + # Create regex operating on string + regex = fnmatch.translate(decode(glob_bytestr)) + + # FIXME: This is an ugly hack... + # fnmatch.translate tries to do multi-line matching and wants the glob to + # match up to the end of the input, which isn't relevant for us, so we + # have to modify the regex. fnmatch.translate has used different regex + # constructs to achieve this with different python versions, so we have + # to check for each of them and then fix it up. It would be much better + # if fnmatch.translate could just take some flags to allow us to specify + # what we want rather than employing this hackery, but since it + # doesn't... 
+ if regex.endswith(r'\Z(?ms)'): # pragma: no cover + regex = regex[0:-7] + elif regex.startswith(r'(?s:') and regex.endswith(r')\Z'): # pragma: no cover + regex = regex[4:-3] + + # Finally, convert back to regex operating on bytestr + return regex.encode() + class PathQuoting: _unescape = {'a': '\a', 'b': '\b', @@ -2037,9 +2064,7 @@ class FilteringOptions(object): if line.startswith('regex:'): regex = line[6:] elif line.startswith('glob:'): - regex = fnmatch.translate(line[5:]) - if regex.endswith(r'\Z(?ms)'): - regex = regex[0:-7] + regex = glob_to_regex(line[5:]) if regex: replace_regexes.append((re.compile(regex), replacement)) else: From 6e78788feb879379d0fe541a6f5c9fd90dd8d643 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Mon, 1 Apr 2019 14:49:28 -0700 Subject: [PATCH 07/17] filter-repo (python3): more flush()ing needed under python3 Signed-off-by: Elijah Newren --- git-filter-repo | 1 + 1 file changed, 1 insertion(+) diff --git a/git-filter-repo b/git-filter-repo index 399a921..4d2562c 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -1655,6 +1655,7 @@ class FastExportFilter(object): stdout=subprocess.PIPE, cwd=self._repo_working_dir) batch_check_process.stdin.write(refname+"\n") + batch_check_process.stdin.flush() line = batch_check_process.stdout.readline() m = batch_check_output_re.match(line) if not m or m.group(2) != 'tag': From effcd5b9ff01cb1a2cfc68d30167c62882fbc424 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Fri, 26 Apr 2019 18:09:21 -0700 Subject: [PATCH 08/17] filter-repo (python3): convert run_coverage Signed-off-by: Elijah Newren --- t/run_coverage | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/t/run_coverage b/t/run_coverage index 0e2fe74..3abd9af 100755 --- a/t/run_coverage +++ b/t/run_coverage @@ -21,8 +21,8 @@ export PYTHONPATH=$tmpdir: ls t939*.sh | xargs -n 1 bash cd $tmpdir -python-coverage combine -python-coverage html -d $orig_dir/report -python-coverage report -m +python3-coverage combine +python3-coverage html -d $orig_dir/report +python3-coverage report -m cd $orig_dir rm -rf $tmpdir From 8b8d6b4b43720fc731b7e4ce98d82f00a781f76c Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Apr 2019 11:32:40 -0700 Subject: [PATCH 09/17] filter-repo (python3): ensure stdin and args are bytes instead of strings Signed-off-by: Elijah Newren --- git-filter-repo | 22 +++++++++++++--------- t/t9391/unusual.py | 4 ++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 4d2562c..3645697 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -1897,16 +1897,17 @@ class FilteringOptions(object): "files matching none of those options.")) path.add_argument('--path-match', '--path', metavar='DIR_OR_FILE', + type=os.fsencode, action=FilteringOptions.AppendFilter, dest='path_changes', help=_("Exact paths (files or directories) to include in filtered " "history. Multiple --path options can be specified to get " "a union of paths.")) - path.add_argument('--path-glob', metavar='GLOB', + path.add_argument('--path-glob', metavar='GLOB', type=os.fsencode, action=FilteringOptions.AppendFilter, dest='path_changes', help=_("Glob of paths to include in filtered history. Multiple " "--path-glob options can be specified to get a union of " "paths.")) - path.add_argument('--path-regex', metavar='REGEX', + path.add_argument('--path-regex', metavar='REGEX', type=os.fsencode, action=FilteringOptions.AppendFilter, dest='path_changes', help=_("Regex of paths to include in filtered history. 
Multiple " "--path-regex options can be specified to get a union of " @@ -1914,31 +1915,32 @@ class FilteringOptions(object): rename = parser.add_argument_group(title=_("Renaming based on paths")) rename.add_argument('--path-rename', '--path-rename-prefix', - metavar='OLD_NAME:NEW_NAME', dest='path_changes', + metavar='OLD_NAME:NEW_NAME', dest='path_changes', type=os.fsencode, action=FilteringOptions.AppendFilter, help=_("Prefix to rename; if filename starts with OLD_NAME, " "replace that with NEW_NAME. Multiple --path-rename " "options can be specified.")) refrename = parser.add_argument_group(title=_("Renaming of refs")) - refrename.add_argument('--tag-rename', metavar='OLD:NEW', + refrename.add_argument('--tag-rename', metavar='OLD:NEW', type=os.fsencode, help=_("Rename tags starting with OLD to start with NEW. For " "example, --tag-rename foo:bar will rename tag foo-1.2.3 " "to bar-1.2.3; either OLD or NEW can be empty.")) helpers = parser.add_argument_group(title=_("Shortcuts")) helpers.add_argument('--subdirectory-filter', metavar='DIRECTORY', - action=FilteringOptions.HelperFilter, + action=FilteringOptions.HelperFilter, type=os.fsencode, help=_("Only look at history that touches the given subdirectory " "and treat that directory as the project root. Equivalent " "to using '--path DIRECTORY/ --path-rename DIRECTORY/:'")) helpers.add_argument('--to-subdirectory-filter', metavar='DIRECTORY', - action=FilteringOptions.HelperFilter, + action=FilteringOptions.HelperFilter, type=os.fsencode, help=_("Treat the project root as instead being under DIRECTORY. " "Equivalent to using '--path-rename :DIRECTORY/'")) people = parser.add_argument_group(title=_("Filtering of names/emails")) people.add_argument('--mailmap', dest='mailmap', metavar='FILENAME', + type=os.fsencode, help=_("Use specified mailmap file (see git-shortlog(1) for " "details on the format) when rewriting author, committer, " "and tagger names and emails. If the specified file is " @@ -1988,8 +1990,9 @@ class FilteringOptions(object): "CALLBACKS section below.")) location = parser.add_argument_group(title=_("Location to filter from/to")) - location.add_argument('--source', help=_("Git repository to read from")) - location.add_argument('--target', + location.add_argument('--source', type=os.fsencode, + help=_("Git repository to read from")) + location.add_argument('--target', type=os.fsencode, help=_("Git repository to overwrite with filtered history")) misc = parser.add_argument_group(title=_("Miscellaneous options")) @@ -2947,7 +2950,8 @@ class RepoFilter(object): def _setup_input(self, use_done_feature): if self._args.stdin: - self._input = sys.stdin + self._input = sys.stdin.detach() + sys.stdin = None # Make sure no one tries to accidentally use it self._fe_orig = None else: skip_blobs = (self._blob_callback is None and diff --git a/t/t9391/unusual.py b/t/t9391/unusual.py index 6a61dbe..684c105 100755 --- a/t/t9391/unusual.py +++ b/t/t9391/unusual.py @@ -58,8 +58,8 @@ filter = fr.FastExportFilter('.', checkpoint_callback = handle_checkpoint, everything_callback = track_everything) -filter.run(input = sys.stdin, - output = open(os.devnull, 'w'), +filter.run(input = sys.stdin.detach(), + output = open(os.devnull, 'wb'), fast_import_pipes = None, quiet = True) # DO NOT depend upon or use _IDS directly you external script writers. 
I'm From 9b3134b68ce94488e8addc93a1830be48a47c82e Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Apr 2019 11:47:12 -0700 Subject: [PATCH 10/17] filter-repo (python3): ensure file reads and writes are done in bytes Signed-off-by: Elijah Newren --- git-filter-repo | 36 ++++++++++++++++++------------------ t/t9391/unusual.py | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 3645697..e006c5a 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -266,7 +266,7 @@ class MailmapInfo(object): comment_re = re.compile(r'\s*#.*') if not os.access(filename, os.R_OK): raise SystemExit(_("Cannot read %s") % filename) - with open(filename) as f: + with open(filename, 'br') as f: count = 0 for line in f: count += 1 @@ -1633,14 +1633,14 @@ class FastExportFilter(object): def record_metadata(self, metadata_dir, orig_refs, refs_nuked): deleted_hash = '0'*40 self._flush_renames() - with open(os.path.join(metadata_dir, 'commit-map'), 'w') as f: + with open(os.path.join(metadata_dir, 'commit-map'), 'bw') as f: f.write("%-40s %s\n" % (_("old"), _("new"))) for (old,new) in self._commit_renames.items(): f.write('{} {}\n'.format(old, new if new != None else deleted_hash)) batch_check_process = None batch_check_output_re = re.compile('^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$') - with open(os.path.join(metadata_dir, 'ref-map'), 'w') as f: + with open(os.path.join(metadata_dir, 'ref-map'), 'bw') as f: for refname, old_hash in orig_refs.items(): if refname in refs_nuked: new_hash = deleted_hash @@ -1669,7 +1669,7 @@ class FastExportFilter(object): batch_check_process.stdin.close() batch_check_process.wait() - with open(os.path.join(metadata_dir, 'suboptimal-issues'), 'w') as f: + with open(os.path.join(metadata_dir, 'suboptimal-issues'), 'bw') as f: issues_found = False if self._commits_no_longer_merges: issues_found = True @@ -1698,7 +1698,7 @@ class FastExportFilter(object): if not issues_found: f.write(_("No filtering problems encountered.")) - with open(os.path.join(metadata_dir, 'already_ran'), 'w') as f: + with open(os.path.join(metadata_dir, 'already_ran'), 'bw') as f: f.write(_("This file exists to allow you to filter again without --force.")) def get_seen_refs(self): @@ -2054,7 +2054,7 @@ class FilteringOptions(object): def get_replace_text(filename): replace_literals = [] replace_regexes = [] - with open(filename) as f: + with open(filename, 'br') as f: for line in f: line = line.rstrip('\r\n') @@ -2353,7 +2353,7 @@ class RepoAnalyze(object): for name in dir_size['packed']: dir_deleted_data[name] = stats['tree_deletions'].get(name, None) - with open(os.path.join(reportdir, "README"), 'w') as f: + with open(os.path.join(reportdir, "README"), 'bw') as f: # Give a basic overview of this file f.write("== %s ==\n" % _("Overall Statistics")) f.write(" %s: %d\n" % (_("Number of commits"), @@ -2443,7 +2443,7 @@ class RepoAnalyze(object): # Equivalence classes for names, so if folks only want to keep a # certain set of paths, they know the old names they want to include # too. 
- with open(os.path.join(reportdir, "renames.txt"), 'w') as f: + with open(os.path.join(reportdir, "renames.txt"), 'bw') as f: seen = set() for pathname,equiv_group in sorted(stats['equivalence'].items(), key=lambda x:(x[1], x[0])): @@ -2455,7 +2455,7 @@ class RepoAnalyze(object): "\n") # List directories in reverse sorted order of unpacked size - with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'w') as f: + with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'bw') as f: f.write("=== %s ===\n" % _("Deleted directories by reverse size")) f.write(_("Format: unpacked size, packed size, date deleted, directory name\n")) for dirname, size in sorted(dir_size['packed'].items(), @@ -2467,7 +2467,7 @@ class RepoAnalyze(object): datestr(dir_deleted_data[dirname]), dirname or _(''))) - with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'w') as f: + with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'bw') as f: f.write("=== %s ===\n" % _("All directories by reverse size")) f.write(_("Format: unpacked size, packed size, date deleted, directory name\n")) for dirname, size in sorted(dir_size['packed'].items(), @@ -2479,7 +2479,7 @@ class RepoAnalyze(object): dirname or _(""))) # List extensions in reverse sorted order of unpacked size - with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'w') as f: + with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'bw') as f: f.write("=== %s ===\n" % _("Deleted extensions by reverse size")) f.write(_("Format: unpacked size, packed size, date deleted, extension name\n")) for extname, size in sorted(ext_size['packed'].items(), @@ -2491,7 +2491,7 @@ class RepoAnalyze(object): datestr(ext_deleted_data[extname]), extname or _(''))) - with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'w') as f: + with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'bw') as f: f.write("=== %s ===\n" % _("All extensions by reverse size")) f.write(_("Format: unpacked size, packed size, date deleted, extension name\n")) for extname, size in sorted(ext_size['packed'].items(), @@ -2503,7 +2503,7 @@ class RepoAnalyze(object): extname or _(''))) # List files in reverse sorted order of unpacked size - with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'w') as f: + with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'bw') as f: f.write("=== %s ===\n" % _("Deleted paths by reverse accumulated size")) f.write(_("Format: unpacked size, packed size, date deleted, path name(s)\n")) for pathname, size in sorted(path_size['packed'].items(), @@ -2516,7 +2516,7 @@ class RepoAnalyze(object): datestr(when), pathname)) - with open(os.path.join(reportdir, "path-all-sizes.txt"), 'w') as f: + with open(os.path.join(reportdir, "path-all-sizes.txt"), 'bw') as f: f.write("=== %s ===\n" % _("All paths by reverse accumulated size")) f.write(_("Format: unpacked size, packed size, date deleted, pathectory name\n")) for pathname, size in sorted(path_size['packed'].items(), @@ -2529,7 +2529,7 @@ class RepoAnalyze(object): pathname)) # List of filenames and sizes in descending order - with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'w') as f: + with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'bw') as f: f.write("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")) f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n")) for sha, size in sorted(stats['packed_size'].items(), @@ -2735,7 +2735,7 @@ class 
RepoFilter(object): for root, dirs, files in os.walk(reflog_dir): for filename in files: pathname = os.path.join(root, filename) - with open(pathname) as f: + with open(pathname, 'br') as f: if len(f.read().splitlines()) > 1: shortpath = pathname[len(reflog_dir)+1:] abort(_("expected at most one entry in the reflog for %s") % @@ -2970,7 +2970,7 @@ class RepoFilter(object): if self._args.dry_run or self._args.debug: self._fe_orig = os.path.join(self.results_tmp_dir(), 'fast-export.original') - output = open(self._fe_orig, 'w') + output = open(self._fe_orig, 'bw') self._input = InputFileBackup(self._input, output) if self._args.debug: print("[DEBUG] Running: {}".format(' '.join(fep_cmd))) @@ -2988,7 +2988,7 @@ class RepoFilter(object): if self._args.dry_run or self._args.debug: self._fe_filt = os.path.join(self.results_tmp_dir(), 'fast-export.filtered') - self._output = open(self._fe_filt, 'w') + self._output = open(self._fe_filt, 'bw') else: self._output = self._fip.stdin if self._args.debug: diff --git a/t/t9391/unusual.py b/t/t9391/unusual.py index 684c105..6817c65 100755 --- a/t/t9391/unusual.py +++ b/t/t9391/unusual.py @@ -59,7 +59,7 @@ filter = fr.FastExportFilter('.', everything_callback = track_everything) filter.run(input = sys.stdin.detach(), - output = open(os.devnull, 'wb'), + output = open(os.devnull, 'bw'), fast_import_pipes = None, quiet = True) # DO NOT depend upon or use _IDS directly you external script writers. I'm From 0279e3882d07bba8cba7a81875c6b898fdaedf83 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Apr 2019 12:05:42 -0700 Subject: [PATCH 11/17] filter-repo (python3): error messages should be strings instead of bytes Signed-off-by: Elijah Newren --- git-filter-repo | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index e006c5a..60ff51f 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -265,7 +265,7 @@ class MailmapInfo(object): name_and_email_re = re.compile(r'(.*?)\s*<([^>]+)>\s*') comment_re = re.compile(r'\s*#.*') if not os.access(filename, os.R_OK): - raise SystemExit(_("Cannot read %s") % filename) + raise SystemExit(_("Cannot read %s") % decode(filename)) with open(filename, 'br') as f: count = 0 for line in f: @@ -2450,8 +2450,8 @@ class RepoAnalyze(object): if equiv_group in seen: continue seen.add(equiv_group) - f.write("{} ->\n ".format(equiv_group[0]) + - "\n ".join(equiv_group[1:]) + + f.write("{} ->\n ".format(decode(equiv_group[0])) + + "\n ".join(decode(x) for x in equiv_group[1:]) + "\n") # List directories in reverse sorted order of unpacked size @@ -2565,7 +2565,7 @@ class RepoAnalyze(object): stats = RepoAnalyze.gather_data(args) # Write the reports - sys.stdout.write(_("Writing reports to %s...") % reportdir) + sys.stdout.write(_("Writing reports to %s...") % decode(reportdir)) sys.stdout.flush() RepoAnalyze.write_report(reportdir, stats) sys.stdout.write(_("done.\n")) @@ -2739,7 +2739,7 @@ class RepoFilter(object): if len(f.read().splitlines()) > 1: shortpath = pathname[len(reflog_dir)+1:] abort(_("expected at most one entry in the reflog for %s") % - shortpath) + decode(shortpath)) # Make sure there are no stashed changes if 'refs/stash' in refs: @@ -2761,9 +2761,11 @@ class RepoFilter(object): continue origin_ref = refname.replace('refs/heads/', 'refs/remotes/origin/') if origin_ref not in refs: - abort(_('%s exists, but %s not found') % (refname, origin_ref)) + abort(_('%s exists, but %s not found') % (decode(refname), + decode(origin_ref))) if 
rev != refs[origin_ref]: - abort(_('%s does not match %s') % (refname, origin_ref)) + abort(_('%s does not match %s') % (decode(refname), + decode(origin_ref))) @staticmethod def tweak_blob(args, blob): @@ -2974,7 +2976,8 @@ class RepoFilter(object): self._input = InputFileBackup(self._input, output) if self._args.debug: print("[DEBUG] Running: {}".format(' '.join(fep_cmd))) - print(" (saving a copy of the output at {})".format(self._fe_orig)) + print(" (saving a copy of the output at {})" + .format(decode(self._fe_orig))) def _setup_output(self): if not self._args.dry_run: @@ -2994,7 +2997,8 @@ class RepoFilter(object): if self._args.debug: self._output = DualFileWriter(self._fip.stdin, self._output) print("[DEBUG] Running: {}".format(' '.join(fip_cmd))) - print(" (using the following file as input: {})".format(self._fe_filt)) + print(" (using the following file as input: {})" + .format(decode(self._fe_filt))) def _migrate_origin_to_heads(self): if self._args.dry_run: @@ -3099,10 +3103,10 @@ class RepoFilter(object): print(_("NOTE: Not running fast-import or cleaning up; --dry-run passed.")) if self._fe_orig: print(_(" Requested filtering can be seen by comparing:")) - print(" " + self._fe_orig) + print(" " + decode(self._fe_orig)) else: print(_(" Requested filtering can be seen at:")) - print(" " + self._fe_filt) + print(" " + decode(self._fe_filt)) return target_working_dir = self._args.target or '.' @@ -3112,7 +3116,7 @@ class RepoFilter(object): if refs_to_nuke: if self._args.debug: print("[DEBUG] Deleting the following refs:\n "+ - "\n ".join(refs_to_nuke)) + decode("\n ".join(refs_to_nuke))) p = subprocess.Popen('git update-ref --stdin'.split(), stdin=subprocess.PIPE, cwd=target_working_dir) From c3072c7f013c409662b6f41e99868d4dd3cc97f8 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Apr 2019 14:10:47 -0700 Subject: [PATCH 12/17] filter-repo (python3): convert StringIO->BytesIO and __str__->__bytes__ Signed-off-by: Elijah Newren --- git-filter-repo | 4 ++-- t/t9391/unusual.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 60ff51f..a76b90f 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -432,12 +432,12 @@ class _GitElement(object): raise SystemExit(_("Unimplemented function: %s") % type(self).__name__ +".dump()") # pragma: no cover - def __str__(self): + def __bytes__(self): """ Convert GitElement to string; used for debugging """ old_dumped = self.dumped - writeme = io.StringIO() + writeme = io.BytesIO() self.dump(writeme) output_lines = writeme.getvalue().splitlines() writeme.close() diff --git a/t/t9391/unusual.py b/t/t9391/unusual.py index 6817c65..c7a2c57 100755 --- a/t/t9391/unusual.py +++ b/t/t9391/unusual.py @@ -50,7 +50,7 @@ compare = "Blob:\n blob\n mark :1\n data {}\n {}".format(len(mystr), mystr) # debugging git-filter-repo; it is NOT something external folks should depend # upon. myblob = fr.Blob(mystr) -assert str(myblob) == compare +assert bytes(myblob) == compare # Everyone should be using RepoFilter objects, not FastExportFilter. But for # testing purposes... 
filter = fr.FastExportFilter('.', @@ -71,7 +71,7 @@ print("Found {} blobs/commits and {} other objects" .format(total_objects['common'], total_objects['uncommon'])) -stream = io.StringIO(textwrap.dedent(''' +stream = io.BytesIO(textwrap.dedent(''' blob mark :1 data 5 From ca5818056d20de08db7191eda657f3eff35931e3 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Apr 2019 14:16:31 -0700 Subject: [PATCH 13/17] filter-repo (python3): oct strings in python3 use "0o" instead of "0" Signed-off-by: Elijah Newren --- git-filter-repo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/git-filter-repo b/git-filter-repo index a76b90f..bd4682a 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -157,7 +157,7 @@ class PathQuoting: '"': '"', '\\':'\\'} _unescape_re = re.compile(r'\\([a-z"\\]|[0-9]{3})') - _escape = [chr(x) for x in range(127)]+['\\'+oct(x)[1:] for x in range(127,256)] + _escape = [chr(x) for x in range(127)]+['\\'+oct(x)[2:] for x in range(127,256)] _reverse = dict(map(reversed, _unescape.items())) for x in _reverse: _escape[ord(x)] = '\\'+_reverse[x] From 4c05cbe07201728a180c210fd3ec50d98d05fe95 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Apr 2019 14:34:59 -0700 Subject: [PATCH 14/17] filter-repo (python3): bytes() instead of chr() or string join Signed-off-by: Elijah Newren --- git-filter-repo | 5 +++-- t/t9391/splice_repos.py | 2 +- t/t9391/unusual.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index bd4682a..b694e8b 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -157,7 +157,8 @@ class PathQuoting: '"': '"', '\\':'\\'} _unescape_re = re.compile(r'\\([a-z"\\]|[0-9]{3})') - _escape = [chr(x) for x in range(127)]+['\\'+oct(x)[2:] for x in range(127,256)] + _escape = [bytes([x]) for x in range(127)]+[ + '\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)] _reverse = dict(map(reversed, _unescape.items())) for x in _reverse: _escape[ord(x)] = '\\'+_reverse[x] @@ -166,7 +167,7 @@ class PathQuoting: @staticmethod def unescape_sequence(orig): seq = orig.group(1) - return PathQuoting._unescape[seq] if len(seq) == 1 else chr(int(seq, 8)) + return PathQuoting._unescape[seq] if len(seq) == 1 else bytes([int(seq, 8)]) @staticmethod def dequote(quoted_string): diff --git a/t/t9391/splice_repos.py b/t/t9391/splice_repos.py index c7834c7..133044e 100755 --- a/t/t9391/splice_repos.py +++ b/t/t9391/splice_repos.py @@ -34,7 +34,7 @@ class InterleaveRepositories: def weave_commit(self, commit): letter = re.match('Commit (.)', commit.message).group(1) - prev_letter = chr(ord(letter)-1) + prev_letter = bytes([ord(letter)-1]) # Splice in any extra commits needed if prev_letter in self.commit_map: diff --git a/t/t9391/unusual.py b/t/t9391/unusual.py index c7a2c57..da0cf89 100755 --- a/t/t9391/unusual.py +++ b/t/t9391/unusual.py @@ -21,7 +21,7 @@ import textwrap import git_filter_repo as fr def handle_progress(progress): - print("Decipher this: "+''.join(reversed(progress.message))) + print("Decipher this: "+bytes(reversed(progress.message))) def handle_checkpoint(checkpoint_object): # Flip a coin; see if we want to pass the checkpoint through. From 12602dae9c199125a79a26afa1883b3f53974e8a Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Apr 2019 14:47:34 -0700 Subject: [PATCH 15/17] filter-repo (python3): f.readline() instead of f.next() and StopIteration File iterators, at least when opened in binary mode, apparently operate differently in python3.
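A minimal sketch of the difference (illustrative only, not part of the patch): python2 file objects provide a .next() method that raises StopIteration at end-of-input, while python3 binary file objects only provide __next__(); the replacement idiom below, mirroring the new code in gather_data(), instead relies on readline() returning an empty bytestring at EOF:

    line = f.readline()   # b'' at EOF, rather than raising StopIteration
    if not line:
      raise SystemExit(_("Nothing to analyze; repository is empty."))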
Signed-off-by: Elijah Newren --- git-filter-repo | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index b694e8b..0348474 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -2244,25 +2244,23 @@ class RepoAnalyze(object): ' --date=short -M -t -c --raw --combined-all-paths') dtp = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=subprocess.PIPE) f = dtp.stdout - try: - line = f.next() - except StopIteration: + line = f.readline() + if not line: raise SystemExit(_("Nothing to analyze; repository is empty.")) cont = bool(line) graph = AncestryGraph() while cont: commit = line.rstrip() - parents = f.next().split() - date = f.next().rstrip() + parents = f.readline().split() + date = f.readline().rstrip() # We expect a blank line next; if we get a non-blank line then # this commit modified no files and we need to move on to the next. # If there is no line, we've reached end-of-input. - try: - line = f.next().rstrip() - cont = True - except StopIteration: + line = f.readline() + if not line: cont = False + line = line.rstrip() # If we haven't reached end of input, and we got a blank line meaning # a commit that has modified files, then get the file changes associated From 385b0586ca47109fd14f75dd27a8ed26d831582e Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Apr 2019 15:00:42 -0700 Subject: [PATCH 16/17] filter-repo (python3): bytestr splicing and iterating is different Unlike how str works, if we grab an array index of a bytestr we get an integer (corresponding to the ASCII value) instead of a bytestr of length 1. Adjust code accordingly. Signed-off-by: Elijah Newren --- git-filter-repo | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 0348474..19742a1 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -181,11 +181,11 @@ class PathQuoting: def enquote(unquoted_string): # Option 1: Quoting when fast-export would: # pqsc = PathQuoting._special_chars - # if any(pqsc[ord(x)] for x in set(unquoted_string)): + # if any(pqsc[x] for x in set(unquoted_string)): # Option 2, perf hack: do minimal amount of quoting required by fast-import if unquoted_string.startswith('"') or '\n' in unquoted_string: pqe = PathQuoting._escape - return '"' + ''.join(pqe[ord(x)] for x in unquoted_string) + '"' + return '"' + ''.join(pqe[x] for x in unquoted_string) + '"' return unquoted_string class AncestryGraph(object): @@ -975,10 +975,10 @@ class FastExportFilter(object): of file-changes that fast-export will provide). 
""" filechange = None - changetype = self._currentline[0] + changetype = self._currentline[0:1] if changetype == 'M': (changetype, mode, idnum, path) = self._currentline.split(None, 3) - if idnum[0] == ':': + if idnum[0:1] == ':': idnum = idnum[1:] path = path.rstrip('\n') # We translate the idnum to our id system @@ -2136,7 +2136,7 @@ class RepoAnalyze(object): @staticmethod def handle_renames(stats, commit, change_types, filenames): for index, change_type in enumerate(change_types): - if change_type == 'R': + if change_type == ord(b'R'): oldname, newname = filenames[index], filenames[-1] RepoAnalyze.setup_equivalence_for_rename(stats, oldname, newname) RepoAnalyze.setup_or_update_rename_history(stats, commit, @@ -2780,9 +2780,9 @@ class RepoFilter(object): return True n = len(path_expression) if (pathname.startswith(path_expression) and - (path_expression[n-1] == '/' or + (path_expression[n-1:n] == '/' or len(pathname) == n or - pathname[n] == '/')): + pathname[n:n+1] == '/')): return True return False From 35052f673d314eae542926dce393d4b77fe4ff26 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Sat, 27 Apr 2019 15:18:59 -0700 Subject: [PATCH 17/17] filter-repo (python3): replace strings with bytestrings This is by far the largest python3 change; it consists basically of * using b'' instead of '' in lots of places * adding a .encode() if we really do work with a string but need to get it converted to a bytestring * replace uses of .format() with interpolation via the '%' operator, since bytestrings don't have a .format() method. Signed-off-by: Elijah Newren --- git-filter-repo | 636 ++++++++++++++------------- t/t9390-filter-repo.sh | 10 +- t/t9391/commit_info.py | 10 +- t/t9391/create_fast_export_output.py | 94 ++-- t/t9391/file_filter.py | 4 +- t/t9391/rename-master-to-develop.py | 4 +- t/t9391/splice_repos.py | 12 +- t/t9391/strip-cvs-keywords.py | 4 +- t/t9391/unusual.py | 10 +- t/t9392-python-callback.sh | 20 +- 10 files changed, 408 insertions(+), 396 deletions(-) diff --git a/git-filter-repo b/git-filter-repo index 19742a1..e66e27e 100755 --- a/git-filter-repo +++ b/git-filter-repo @@ -87,12 +87,12 @@ class FixedTimeZone(tzinfo): Fixed offset in minutes east from UTC. """ - tz_re = re.compile(r'^([-+]?)(\d\d)(\d\d)$') + tz_re = re.compile(br'^([-+]?)(\d\d)(\d\d)$') def __init__(self, offset_string): tzinfo.__init__(self) sign, hh, mm = FixedTimeZone.tz_re.match(offset_string).groups() - factor = -1 if (sign and sign == '-') else 1 + factor = -1 if (sign and sign == b'-') else 1 self._offset = timedelta(minutes = factor*(60*int(hh) + int(mm))) self._offset_string = offset_string @@ -112,8 +112,8 @@ def string_to_date(datestring): def date_to_string(dateobj): epoch = datetime.fromtimestamp(0, dateobj.tzinfo) - return('{} {}'.format(int(_timedelta_to_seconds(dateobj - epoch)), - dateobj.tzinfo.tzname(0))) + return(b'%d %s' % (int(_timedelta_to_seconds(dateobj - epoch)), + dateobj.tzinfo.tzname(0))) def decode(bytestr): 'Try to convert bytestr to utf-8 for outputting as an error message.' 
@@ -147,21 +147,21 @@ def glob_to_regex(glob_bytestr): return regex.encode() class PathQuoting: - _unescape = {'a': '\a', - 'b': '\b', - 'f': '\f', - 'n': '\n', - 'r': '\r', - 't': '\t', - 'v': '\v', - '"': '"', - '\\':'\\'} - _unescape_re = re.compile(r'\\([a-z"\\]|[0-9]{3})') + _unescape = {b'a': b'\a', + b'b': b'\b', + b'f': b'\f', + b'n': b'\n', + b'r': b'\r', + b't': b'\t', + b'v': b'\v', + b'"': b'"', + b'\\':b'\\'} + _unescape_re = re.compile(br'\\([a-z"\\]|[0-9]{3})') _escape = [bytes([x]) for x in range(127)]+[ - '\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)] + b'\\'+bytes(ord(c) for c in oct(x)[2:]) for x in range(127,256)] _reverse = dict(map(reversed, _unescape.items())) for x in _reverse: - _escape[ord(x)] = '\\'+_reverse[x] + _escape[ord(x)] = b'\\'+_reverse[x] _special_chars = [len(x) > 1 for x in _escape] @staticmethod @@ -171,8 +171,8 @@ class PathQuoting: @staticmethod def dequote(quoted_string): - if quoted_string.startswith('"'): - assert quoted_string.endswith('"') + if quoted_string.startswith(b'"'): + assert quoted_string.endswith(b'"') return PathQuoting._unescape_re.sub(PathQuoting.unescape_sequence, quoted_string[1:-1]) return quoted_string @@ -183,9 +183,9 @@ class PathQuoting: # pqsc = PathQuoting._special_chars # if any(pqsc[x] for x in set(unquoted_string)): # Option 2, perf hack: do minimal amount of quoting required by fast-import - if unquoted_string.startswith('"') or '\n' in unquoted_string: + if unquoted_string.startswith(b'"') or b'\n' in unquoted_string: pqe = PathQuoting._escape - return '"' + ''.join(pqe[x] for x in unquoted_string) + '"' + return b'"' + b''.join(pqe[x] for x in unquoted_string) + b'"' return unquoted_string class AncestryGraph(object): @@ -263,8 +263,8 @@ class MailmapInfo(object): self._parse_file(filename) def _parse_file(self, filename): - name_and_email_re = re.compile(r'(.*?)\s*<([^>]+)>\s*') - comment_re = re.compile(r'\s*#.*') + name_and_email_re = re.compile(br'(.*?)\s*<([^>]+)>\s*') + comment_re = re.compile(br'\s*#.*') if not os.access(filename, os.R_OK): raise SystemExit(_("Cannot read %s") % decode(filename)) with open(filename, 'br') as f: @@ -273,7 +273,7 @@ class MailmapInfo(object): count += 1 err = "Unparseable mailmap file: line #{} is bad: {}".format(count, line) # Remove comments - line = comment_re.sub('', line) + line = comment_re.sub(b'', line) # Remove leading and trailing whitespace line = line.strip() if not line: @@ -443,7 +443,8 @@ class _GitElement(object): output_lines = writeme.getvalue().splitlines() writeme.close() self.dumped = old_dumped - return "{}:\n {}".format(type(self).__name__, "\n ".join(output_lines)) + return b"%s:\n %s" % (type(self).__name__.encode(), + b"\n ".join(output_lines)) def skip(self, new_id=None): """ @@ -491,6 +492,7 @@ class Blob(_GitElementWithId): self.original_id = original_id # Stores the blob's data + assert(type(data) == bytes) self.data = data def dump(self, file_): @@ -499,10 +501,10 @@ class Blob(_GitElementWithId): """ self.dumped = 1 - file_.write('blob\n') - file_.write('mark :%d\n' % self.id) - file_.write('data %d\n%s' % (len(self.data), self.data)) - file_.write('\n') + file_.write(b'blob\n') + file_.write(b'mark :%d\n' % self.id) + file_.write(b'data %d\n%s' % (len(self.data), self.data)) + file_.write(b'\n') class Reset(_GitElement): @@ -530,10 +532,10 @@ class Reset(_GitElement): """ self.dumped = 1 - file_.write('reset %s\n' % self.ref) + file_.write(b'reset %s\n' % self.ref) if self.from_ref: - file_.write('from :%d\n' % self.from_ref) - 
file_.write('\n') + file_.write(b'from :%d\n' % self.from_ref) + file_.write(b'\n') class FileChanges(_GitElement): """ @@ -544,7 +546,10 @@ class FileChanges(_GitElement): def __init__(self, type_, filename, id_ = None, mode = None): _GitElement.__init__(self) - # Denote the type of file-change (M for modify, D for delete, etc) + # Denote the type of file-change (b'M' for modify, b'D' for delete, etc) + # We could + # assert(type(type_) == bytes) + # here but I don't just due to worries about performance overhead... self.type = type_ # Record the name of the file being changed @@ -557,15 +562,15 @@ class FileChanges(_GitElement): # blob_id is the id (mark) of the affected blob self.blob_id = None - # For 'M' file changes (modify), expect to have id and mode - if type_ == 'M': + # For b'M' file changes (modify), expect to have id and mode + if type_ == b'M': if mode is None: raise SystemExit(_("file mode and idnum needed for %s") % filename) # pragma: no cover self.mode = mode self.blob_id = id_ - # For 'R' file changes (rename), expect to have newname as third arg - elif type_ == 'R': # pragma: no cover (now avoid fast-export renames) + # For b'R' file changes (rename), expect to have newname as third arg + elif type_ == b'R': # pragma: no cover (now avoid fast-export renames) if id_ is None: raise SystemExit(_("new name needed for rename of %s") % filename) self.filename = (self.filename, id_) @@ -574,17 +579,17 @@ class FileChanges(_GitElement): """ Write this file-change element to a file """ - skipped_blob = (self.type == 'M' and self.blob_id is None) + skipped_blob = (self.type == b'M' and self.blob_id is None) if skipped_blob: return self.dumped = 1 quoted_filename = PathQuoting.enquote(self.filename) - if self.type == 'M' and isinstance(self.blob_id, int): - file_.write('M %s :%d %s\n' % (self.mode, self.blob_id, quoted_filename)) - elif self.type == 'M': - file_.write('M %s %s %s\n' % (self.mode, self.blob_id, quoted_filename)) - elif self.type == 'D': - file_.write('D %s\n' % quoted_filename) + if self.type == b'M' and isinstance(self.blob_id, int): + file_.write(b'M %s :%d %s\n' % (self.mode, self.blob_id, quoted_filename)) + elif self.type == b'M': + file_.write(b'M %s %s %s\n' % (self.mode, self.blob_id, quoted_filename)) + elif self.type == b'D': + file_.write(b'D %s\n' % quoted_filename) else: raise SystemExit(_("Unhandled filechange type: %s") % self.type) # pragma: no cover @@ -648,16 +653,16 @@ class Commit(_GitElementWithId): # Make output to fast-import slightly easier for humans to read if the # message has no trailing newline of its own; cosmetic, but a nice touch... 
- extra_newline = '\n' - if self.message.endswith('\n') or not (self.parents or self.file_changes): - extra_newline = '' + extra_newline = b'\n' + if self.message.endswith(b'\n') or not (self.parents or self.file_changes): + extra_newline = b'' - file_.write(('commit {}\n' - 'mark :{}\n' - 'author {} <{}> {}\n' - 'committer {} <{}> {}\n' - 'data {}\n{}{}' - ).format( + file_.write((b'commit %s\n' + b'mark :%d\n' + b'author %s <%s> %s\n' + b'committer %s <%s> %s\n' + b'data %d\n%s%s' + ) % ( self.branch, self.id, self.author_name, self.author_email, self.author_date, self.committer_name, self.committer_email, self.committer_date, @@ -665,16 +670,18 @@ class Commit(_GitElementWithId): extra_newline) ) for i, parent in enumerate(self.parents): - mark = ':' if isinstance(parent, int) else '' - file_.write('from ' if i==0 else 'merge ') - file_.write('{}{}\n'.format(mark, parent)) + file_.write(b'from ' if i==0 else b'merge ') + if isinstance(parent, int): + file_.write(b':%d\n' % parent) + else: + file_.write(b'%s\n' % parent) for change in self.file_changes: change.dump(file_) if not self.parents and not self.file_changes: # Workaround a bug in pre-git-2.22 versions of fast-import with # the get-mark directive. - file_.write('\n') - file_.write('\n') + file_.write(b'\n') + file_.write(b'\n') def first_parent(self): """ @@ -729,15 +736,15 @@ class Tag(_GitElement): self.dumped = 1 - file_.write('tag %s\n' % self.ref) - mark = ':' if isinstance(self.from_ref, int) else '' - file_.write('from {}{}\n'.format(mark, self.from_ref)) + file_.write(b'tag %s\n' % self.ref) + markfmt = b'from :%d\n' if isinstance(self.from_ref, int) else 'from %s\n' + file_.write(markfmt % self.from_ref) if self.tagger_name: - file_.write('tagger %s <%s> ' % (self.tagger_name, self.tagger_email)) + file_.write(b'tagger %s <%s> ' % (self.tagger_name, self.tagger_email)) file_.write(self.tagger_date) - file_.write('\n') - file_.write('data %d\n%s' % (len(self.message), self.message)) - file_.write('\n') + file_.write(b'\n') + file_.write(b'data %d\n%s' % (len(self.message), self.message)) + file_.write(b'\n') class Progress(_GitElement): """ @@ -761,8 +768,8 @@ class Progress(_GitElement): """ self.dumped = 1 - file_.write('progress %s\n' % self.message) - file_.write('\n') + file_.write(b'progress %s\n' % self.message) + file_.write(b'\n') class Checkpoint(_GitElement): """ @@ -784,8 +791,8 @@ class Checkpoint(_GitElement): """ self.dumped = 1 - file_.write('checkpoint\n') - file_.write('\n') + file_.write(b'checkpoint\n') + file_.write(b'\n') class LiteralCommand(_GitElement): """ @@ -910,20 +917,20 @@ class FastExportFilter(object): self._files_tweaked = set() # Compile some regexes and cache those - self._mark_re = re.compile(r'mark :(\d+)\n$') + self._mark_re = re.compile(br'mark :(\d+)\n$') self._parent_regexes = {} - parent_regex_rules = ('{} :(\d+)\n$', '{} ([0-9a-f]{{40}})\n') - for parent_refname in ('from', 'merge'): - ans = [re.compile(x.format(parent_refname)) for x in parent_regex_rules] + parent_regex_rules = (b' :(\d+)\n$', b' ([0-9a-f]{40})\n') + for parent_refname in (b'from', b'merge'): + ans = [re.compile(parent_refname+x) for x in parent_regex_rules] self._parent_regexes[parent_refname] = ans - self._quoted_string_re = re.compile(r'"(?:[^"\\]|\\.)*"') + self._quoted_string_re = re.compile(br'"(?:[^"\\]|\\.)*"') self._refline_regexes = {} - for refline_name in ('reset', 'commit', 'tag', 'progress'): - self._refline_regexes[refline_name] = re.compile(refline_name+' (.*)\n$') + for refline_name in 
(b'reset', b'commit', b'tag', b'progress'): + self._refline_regexes[refline_name] = re.compile(refline_name+b' (.*)\n$') self._user_regexes = {} - for user in ('author', 'committer', 'tagger'): - self._user_regexes[user] = re.compile(user + ' (.*?) <(.*?)> (.*)\n$') - self._hash_re = re.compile(r'(\b[0-9a-f]{7,40}\b)') + for user in (b'author', b'committer', b'tagger'): + self._user_regexes[user] = re.compile(user + b' (.*?) <(.*?)> (.*)\n$') + self._hash_re = re.compile(br'(\b[0-9a-f]{7,40}\b)') def _advance_currentline(self): """ @@ -971,51 +978,51 @@ class FastExportFilter(object): """ If the current line contains a file-change object, then parse it and advance the current line; otherwise return None. We only care - about file changes of type 'M' and 'D' (these are the only types + about file changes of type b'M' and b'D' (these are the only types of file-changes that fast-export will provide). """ filechange = None changetype = self._currentline[0:1] - if changetype == 'M': + if changetype == b'M': (changetype, mode, idnum, path) = self._currentline.split(None, 3) - if idnum[0:1] == ':': + if idnum[0:1] == b':': idnum = idnum[1:] - path = path.rstrip('\n') + path = path.rstrip(b'\n') # We translate the idnum to our id system if len(idnum) != 40: idnum = _IDS.translate( int(idnum) ) if idnum is not None: - if path.startswith('"'): + if path.startswith(b'"'): path = PathQuoting.dequote(path) - filechange = FileChanges('M', path, idnum, mode) + filechange = FileChanges(b'M', path, idnum, mode) else: - filechange = 'skipped' + filechange = b'skipped' self._advance_currentline() - elif changetype == 'D': + elif changetype == b'D': (changetype, path) = self._currentline.split(None, 1) - path = path.rstrip('\n') - if path.startswith('"'): + path = path.rstrip(b'\n') + if path.startswith(b'"'): path = PathQuoting.dequote(path) - filechange = FileChanges('D', path) + filechange = FileChanges(b'D', path) self._advance_currentline() - elif changetype == 'R': # pragma: no cover (now avoid fast-export renames) + elif changetype == b'R': # pragma: no cover (now avoid fast-export renames) rest = self._currentline[2:-1] - if rest.startswith('"'): + if rest.startswith(b'"'): m = self._quoted_string_re.match(rest) if not m: raise SystemExit(_("Couldn't parse rename source")) orig = PathQuoting.dequote(m.group(0)) new = rest[m.end()+1:] else: - orig, new = rest.split(' ', 1) - if new.startswith('"'): + orig, new = rest.split(b' ', 1) + if new.startswith(b'"'): new = PathQuoting.dequote(new) - filechange = FileChanges('R', orig, new) + filechange = FileChanges(b'R', orig, new) self._advance_currentline() return filechange def _parse_original_id(self): - original_id = self._currentline[len('original-oid '):].rstrip() + original_id = self._currentline[len(b'original-oid '):].rstrip() self._advance_currentline() return original_id @@ -1049,8 +1056,8 @@ class FastExportFilter(object): # fast-import will not choke on. Let's do that. Note that +051800 # seems to be the only weird timezone found in the wild, by me or some # other posts google returned on the subject... - if when.endswith('+051800'): - when = when[0:-7]+'+0261' + if when.endswith(b'+051800'): + when = when[0:-7]+b'+0261' self._advance_currentline() return (name, email, when) @@ -1061,11 +1068,11 @@ class FastExportFilter(object): the data. 
""" fields = self._currentline.split() - assert fields[0] == 'data' + assert fields[0] == b'data' size = int(fields[1]) data = self._input.read(size) self._advance_currentline() - if self._currentline == '\n': + if self._currentline == b'\n': self._advance_currentline() return data @@ -1082,11 +1089,11 @@ class FastExportFilter(object): id_ = self._parse_optional_mark() original_id = None - if self._currentline.startswith('original-oid'): + if self._currentline.startswith(b'original-oid'): original_id = self._parse_original_id(); data = self._parse_data() - if self._currentline == '\n': + if self._currentline == b'\n': self._advance_currentline() # Create the blob @@ -1117,9 +1124,9 @@ class FastExportFilter(object): the callback). """ # Parse the Reset - ref = self._parse_ref_line('reset') - ignoreme, from_ref = self._parse_optional_parent_ref('from') - if self._currentline == '\n': + ref = self._parse_ref_line(b'reset') + ignoreme, from_ref = self._parse_optional_parent_ref(b'from') + if self._currentline == b'\n': self._advance_currentline() # fast-export likes to print extraneous resets that serve no purpose. @@ -1342,19 +1349,19 @@ class FastExportFilter(object): for change in commit.file_changes: parent = new_1st_parent or commit.parents[0] # exists due to above checks quoted_filename = PathQuoting.enquote(change.filename) - self._output.write("ls :{} {}\n".format(parent, quoted_filename)) + self._output.write(b"ls :%d %s\n" % (parent, quoted_filename)) self._output.flush() parent_version = fi_output.readline().split() - if change.type == 'D': - if parent_version != ['missing', quoted_filename]: + if change.type == b'D': + if parent_version != [b'missing', quoted_filename]: return False else: blob_sha = change.blob_id if isinstance(change.blob_id, int): - self._output.write("get-mark :{}\n".format(change.blob_id)) + self._output.write(b"get-mark :%d\n" % change.blob_id) self._output.flush() blob_sha = fi_output.readline().rstrip() - if parent_version != [change.mode, 'blob', blob_sha, quoted_filename]: + if parent_version != [change.mode, b'blob', blob_sha, quoted_filename]: return False return True @@ -1364,7 +1371,7 @@ class FastExportFilter(object): # Record the mapping of old commit hash to new one if commit.original_id and self._fast_import_pipes: fi_input, fi_output = self._fast_import_pipes - self._output.write("get-mark :{}\n".format(commit.id)) + self._output.write(b"get-mark :%d\n" % commit.id) self._output.flush() orig_id = commit.original_id self._commit_short_old_hashes[orig_id[0:7]].add(orig_id) @@ -1390,19 +1397,19 @@ class FastExportFilter(object): """ # Parse the Commit. This may look involved, but it's pretty simple; it only # looks bad because a commit object contains many pieces of data. 
- branch = self._parse_ref_line('commit') + branch = self._parse_ref_line(b'commit') id_ = self._parse_optional_mark() original_id = None - if self._currentline.startswith('original-oid'): + if self._currentline.startswith(b'original-oid'): original_id = self._parse_original_id(); author_name = None - if self._currentline.startswith('author'): - (author_name, author_email, author_date) = self._parse_user('author') + if self._currentline.startswith(b'author'): + (author_name, author_email, author_date) = self._parse_user(b'author') (committer_name, committer_email, committer_date) = \ - self._parse_user('committer') + self._parse_user(b'committer') if not author_name: (author_name, author_email, author_date) = \ @@ -1411,12 +1418,12 @@ class FastExportFilter(object): commit_msg = self._parse_data() commit_msg = self._hash_re.sub(self._translate_commit_hash, commit_msg) - pinfo = [self._parse_optional_parent_ref('from')] + pinfo = [self._parse_optional_parent_ref(b'from')] # Due to empty pruning, we can have real 'from' and 'merge' lines that # due to commit rewriting map to a parent of None. We need to record # 'from' if its non-None, and we need to parse all 'merge' lines. - while self._currentline.startswith('merge '): - pinfo.append(self._parse_optional_parent_ref('merge')) + while self._currentline.startswith(b'merge '): + pinfo.append(self._parse_optional_parent_ref(b'merge')) orig_parents, parents = [list(tmp) for tmp in zip(*pinfo)] # No parents is oddly represented as [None] instead of [], due to the @@ -1434,10 +1441,10 @@ class FastExportFilter(object): file_change = self._parse_optional_filechange() had_file_changes = file_change is not None while file_change: - if not (type(file_change) == str and file_change == 'skipped'): + if not (type(file_change) == bytes and file_change == b'skipped'): file_changes.append(file_change) file_change = self._parse_optional_filechange() - if self._currentline == '\n': + if self._currentline == b'\n': self._advance_currentline() # Okay, now we can finally create the Commit object @@ -1510,18 +1517,18 @@ class FastExportFilter(object): the callback). """ # Parse the Tag - tag = self._parse_ref_line('tag') - ignoreme, from_ref = self._parse_optional_parent_ref('from') + tag = self._parse_ref_line(b'tag') + ignoreme, from_ref = self._parse_optional_parent_ref(b'from') original_id = None - if self._currentline.startswith('original-oid'): + if self._currentline.startswith(b'original-oid'): original_id = self._parse_original_id(); tagger_name, tagger_email, tagger_date = None, None, None - if self._currentline.startswith('tagger'): - (tagger_name, tagger_email, tagger_date) = self._parse_user('tagger') + if self._currentline.startswith(b'tagger'): + (tagger_name, tagger_email, tagger_date) = self._parse_user(b'tagger') tag_msg = self._parse_data() - if self._currentline == '\n': + if self._currentline == b'\n': self._advance_currentline() # Create the tag @@ -1544,7 +1551,7 @@ class FastExportFilter(object): tag.dump(self._output) # Record the fact that this tag was seen so we don't nuke it as part # of refs_to_nuke. - full_ref = 'refs/tags/{}'.format(tag.ref) + full_ref = b'refs/tags/' + tag.ref self._seen_refs[full_ref] = None def _parse_progress(self): @@ -1556,8 +1563,8 @@ class FastExportFilter(object): everything else is done (unless it has been skipped by the callback). 
""" # Parse the Progress - message = self._parse_ref_line('progress') - if self._currentline == '\n': + message = self._parse_ref_line(b'progress') + if self._currentline == b'\n': self._advance_currentline() # Create the progress message @@ -1585,7 +1592,7 @@ class FastExportFilter(object): """ # Parse the Checkpoint self._advance_currentline() - if self._currentline == '\n': + if self._currentline == b'\n': self._advance_currentline() # Create the checkpoint @@ -1632,16 +1639,17 @@ class FastExportFilter(object): reset.dump(self._output) def record_metadata(self, metadata_dir, orig_refs, refs_nuked): - deleted_hash = '0'*40 + deleted_hash = b'0'*40 self._flush_renames() - with open(os.path.join(metadata_dir, 'commit-map'), 'bw') as f: - f.write("%-40s %s\n" % (_("old"), _("new"))) + with open(os.path.join(metadata_dir, b'commit-map'), 'bw') as f: + f.write(("%-40s %s\n" % (_("old"), _("new"))).encode()) for (old,new) in self._commit_renames.items(): - f.write('{} {}\n'.format(old, new if new != None else deleted_hash)) + msg = b'%s %s\n' % (old, new if new != None else deleted_hash) + f.write(msg) batch_check_process = None - batch_check_output_re = re.compile('^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$') - with open(os.path.join(metadata_dir, 'ref-map'), 'bw') as f: + batch_check_output_re = re.compile(b'^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$') + with open(os.path.join(metadata_dir, b'ref-map'), 'bw') as f: for refname, old_hash in orig_refs.items(): if refname in refs_nuked: new_hash = deleted_hash @@ -1655,22 +1663,22 @@ class FastExportFilter(object): stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd=self._repo_working_dir) - batch_check_process.stdin.write(refname+"\n") + batch_check_process.stdin.write(refname+b"\n") batch_check_process.stdin.flush() line = batch_check_process.stdout.readline() m = batch_check_output_re.match(line) - if not m or m.group(2) != 'tag': + if not m or m.group(2) != b'tag': raise SystemExit(_("Failed to find new id for %(refname)s " "(old id was %(old_hash)s)") % ({'refname': refname, 'old_hash': old_hash}) ) # pragma: no cover new_hash = m.group(1) - f.write('{} {} {}\n'.format(old_hash, new_hash, refname)) + f.write(b'%s %s %s\n' % (old_hash, new_hash, refname)) if batch_check_process: batch_check_process.stdin.close() batch_check_process.wait() - with open(os.path.join(metadata_dir, 'suboptimal-issues'), 'bw') as f: + with open(os.path.join(metadata_dir, b'suboptimal-issues'), 'bw') as f: issues_found = False if self._commits_no_longer_merges: issues_found = True @@ -1680,10 +1688,10 @@ class FastExportFilter(object): are now regular commits; they likely have suboptimal commit messages (e.g. "Merge branch next into master"). Original commit hash on the left, commit hash after filtering/rewriting on the right: - ''')[1:])) + ''')[1:]).encode()) for oldhash, newhash in self._commits_no_longer_merges: - f.write(' {} {}\n'.format(oldhash, newhash)) - f.write('\n') + f.write(' {} {}\n'.format(oldhash, newhash).encode()) + f.write(b'\n') if self._commits_referenced_but_removed: issues_found = True @@ -1691,16 +1699,16 @@ class FastExportFilter(object): The following commits were filtered out, but referenced in another commit message. 
The reference to the now-nonexistent commit hash (or a substring thereof) was left as-is in any commit messages: - ''')[1:])) + ''')[1:]).encode()) for bad_commit_reference in self._commits_referenced_but_removed: - f.write(' {}\n'.format(bad_commit_reference)) - f.write('\n') + f.write(' {}\n'.format(bad_commit_reference).encode()) + f.write(b'\n') if not issues_found: - f.write(_("No filtering problems encountered.")) + f.write(_("No filtering problems encountered.\n").encode()) - with open(os.path.join(metadata_dir, 'already_ran'), 'bw') as f: - f.write(_("This file exists to allow you to filter again without --force.")) + with open(os.path.join(metadata_dir, b'already_ran'), 'bw') as f: + f.write(_("This file exists to allow you to filter again without --force.\n").encode()) def get_seen_refs(self): return self._seen_refs.keys() @@ -1718,30 +1726,30 @@ class FastExportFilter(object): # Run over the input and do the filtering self._advance_currentline() while self._currentline: - if self._currentline.startswith('blob'): + if self._currentline.startswith(b'blob'): self._parse_blob() - elif self._currentline.startswith('reset'): + elif self._currentline.startswith(b'reset'): self._parse_reset() - elif self._currentline.startswith('commit'): + elif self._currentline.startswith(b'commit'): self._parse_commit() - elif self._currentline.startswith('tag'): + elif self._currentline.startswith(b'tag'): self._parse_tag() - elif self._currentline.startswith('progress'): + elif self._currentline.startswith(b'progress'): self._parse_progress() - elif self._currentline.startswith('checkpoint'): + elif self._currentline.startswith(b'checkpoint'): self._parse_checkpoint() - elif self._currentline.startswith('feature'): + elif self._currentline.startswith(b'feature'): self._parse_literal_command() - elif self._currentline.startswith('option'): + elif self._currentline.startswith(b'option'): self._parse_literal_command() - elif self._currentline.startswith('done'): + elif self._currentline.startswith(b'done'): self._handle_final_commands() self._parse_literal_command() - elif self._currentline.startswith('#'): + elif self._currentline.startswith(b'#'): self._parse_literal_command() - elif self._currentline.startswith('get-mark') or \ - self._currentline.startswith('cat-blob') or \ - self._currentline.startswith('ls'): + elif self._currentline.startswith(b'get-mark') or \ + self._currentline.startswith(b'cat-blob') or \ + self._currentline.startswith(b'ls'): raise SystemExit(_("Unsupported command: '%s'") % self._currentline) else: raise SystemExit(_("Could not parse line: '%s'") % self._currentline) @@ -1798,13 +1806,13 @@ class GitUtils(object): def is_repository_bare(repo_working_dir): out = subprocess.check_output('git rev-parse --is-bare-repository'.split(), cwd=repo_working_dir) - return (out.strip() == 'true') + return (out.strip() == b'true') @staticmethod def determine_git_dir(repo_working_dir): d = subprocess.check_output('git rev-parse --git-dir'.split(), cwd=repo_working_dir).strip() - if repo_working_dir=='.' or d.startswith('/'): + if repo_working_dir==b'.' 
or d.startswith(b'/'): return d return os.path.join(repo_working_dir, d) @@ -1841,12 +1849,12 @@ class FilteringOptions(object): def __call__(self, parser, namespace, values, option_string=None): af = FilteringOptions.AppendFilter(dest='path_changes', option_strings=None) - dirname = values if values[-1] == '/' else values+'/' + dirname = values if values[-1] == b'/' else values+b'/' if option_string == '--subdirectory-filter': af(parser, namespace, dirname, '--path-match') - af(parser, namespace, dirname+':', '--path-rename') + af(parser, namespace, dirname+b':', '--path-rename') elif option_string == '--to-subdirectory-filter': - af(parser, namespace, ':'+dirname, '--path-rename') + af(parser, namespace, b':'+dirname, '--path-rename') else: raise SystemExit(_("Error: HelperFilter given invalid option_string: %s") % option_string) # pragma: no cover @@ -2047,7 +2055,7 @@ class FilteringOptions(object): stdout=subprocess.PIPE, stderr=subprocess.STDOUT) p.wait() output = p.stdout.read() - if '--combined-all-paths' not in output: + if b'--combined-all-paths' not in output: raise SystemExit(_("Error: need a version of git whose diff-tree command " "has the --combined-all-paths option")) # pragma: no cover @@ -2057,24 +2065,24 @@ class FilteringOptions(object): replace_regexes = [] with open(filename, 'br') as f: for line in f: - line = line.rstrip('\r\n') + line = line.rstrip(b'\r\n') # Determine the replacement - replacement = '***REMOVED***' - if '==>' in line: - line, replacement = line.rsplit('==>', 1) + replacement = b'***REMOVED***' + if b'==>' in line: + line, replacement = line.rsplit(b'==>', 1) # See if we need to match via regex regex = None - if line.startswith('regex:'): + if line.startswith(b'regex:'): regex = line[6:] - elif line.startswith('glob:'): + elif line.startswith(b'glob:'): regex = glob_to_regex(line[5:]) if regex: replace_regexes.append((re.compile(regex), replacement)) else: # Otherwise, find the literal we need to replace - if line.startswith('literal:'): + if line.startswith(b'literal:'): line = line[8:] if not line: continue @@ -2149,7 +2157,7 @@ class RepoAnalyze(object): # Figure out kind of deletions to undo for this file, and update lists # of all-names-by-sha and all-filenames delmode = 'tree_deletions' - if mode != '040000': + if mode != b'040000': delmode = 'file_deletions' stats['names'][sha].add(filename) stats['allnames'].add(filename) @@ -2179,22 +2187,22 @@ class RepoAnalyze(object): graph.add_commit_and_parents(commit, parents) for change in file_changes: modes, shas, change_types, filenames = change - if len(parents) == 1 and change_types.startswith('R'): - change_types = 'R' # remove the rename score; we don't care - if modes[-1] == '160000': + if len(parents) == 1 and change_types.startswith(b'R'): + change_types = b'R' # remove the rename score; we don't care + if modes[-1] == b'160000': continue - elif modes[-1] == '000000': + elif modes[-1] == b'000000': # Track when files/directories are deleted for f in RepoAnalyze.equiv_class(stats, filenames[-1]): - if any(x == '040000' for x in modes[0:-1]): + if any(x == b'040000' for x in modes[0:-1]): stats['tree_deletions'][f] = date else: stats['file_deletions'][f] = date - elif change_types.strip('AMT') == '': + elif change_types.strip(b'AMT') == b'': RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) - elif modes[-1] == '040000' and change_types.strip('RAM') == '': + elif modes[-1] == b'040000' and change_types.strip(b'RAM') == b'': RepoAnalyze.handle_file(stats, graph, commit, 
modes, shas, filenames) - elif change_types.strip('RAM') == '': + elif change_types.strip(b'RAM') == b'': RepoAnalyze.handle_file(stats, graph, commit, modes, shas, filenames) RepoAnalyze.handle_renames(stats, commit, change_types, filenames) else: @@ -2219,7 +2227,7 @@ class RepoAnalyze(object): for line in cf.stdout: sha, objtype, objsize, objdisksize = line.split() objsize, objdisksize = int(objsize), int(objdisksize) - if objtype == 'blob': + if objtype == b'blob': unpacked_size[sha] = objsize packed_size[sha] = objdisksize num_blobs += 1 @@ -2269,17 +2277,17 @@ class RepoAnalyze(object): if cont and not line: cont = False for line in f: - if not line.startswith(':'): + if not line.startswith(b':'): cont = True break n = 1+max(1, len(parents)) - assert line.startswith(':'*(n-1)) + assert line.startswith(b':'*(n-1)) relevant = line[n-1:-1] splits = relevant.split(None, n) modes = splits[0:n] splits = splits[n].split(None, n) shas = splits[0:n] - splits = splits[n].split('\t') + splits = splits[n].split(b'\t') change_types = splits[0] filenames = [PathQuoting.dequote(x) for x in splits[1:]] file_changes.append([modes, shas, change_types, filenames]) @@ -2304,13 +2312,13 @@ class RepoAnalyze(object): @staticmethod def write_report(reportdir, stats): def datestr(datetimestr): - return datetimestr if datetimestr else _('') + return datetimestr if datetimestr else _('').encode() def dirnames(path): while True: path = os.path.dirname(path) yield path - if path == '': + if path == b'': break # Compute aggregate size information for paths, extensions, and dirs @@ -2352,27 +2360,27 @@ class RepoAnalyze(object): for name in dir_size['packed']: dir_deleted_data[name] = stats['tree_deletions'].get(name, None) - with open(os.path.join(reportdir, "README"), 'bw') as f: + with open(os.path.join(reportdir, b"README"), 'bw') as f: # Give a basic overview of this file - f.write("== %s ==\n" % _("Overall Statistics")) - f.write(" %s: %d\n" % (_("Number of commits"), - stats['num_commits'])) - f.write(" %s: %d\n" % (_("Number of filenames"), - len(path_size['packed']))) - f.write(" %s: %d\n" % (_("Number of directories"), - len(dir_size['packed']))) - f.write(" %s: %d\n" % (_("Number of file extensions"), - len(ext_size['packed']))) - f.write("\n") - f.write(" %s: %d\n" % (_("Total unpacked size (bytes)"), - total_size['unpacked'])) - f.write(" %s: %d\n" % (_("Total packed size (bytes)"), - total_size['packed'])) - f.write("\n") + f.write(b"== %s ==\n" % _("Overall Statistics").encode()) + f.write((" %s: %d\n" % (_("Number of commits"), + stats['num_commits'])).encode()) + f.write((" %s: %d\n" % (_("Number of filenames"), + len(path_size['packed']))).encode()) + f.write((" %s: %d\n" % (_("Number of directories"), + len(dir_size['packed']))).encode()) + f.write((" %s: %d\n" % (_("Number of file extensions"), + len(ext_size['packed']))).encode()) + f.write(b"\n") + f.write((" %s: %d\n" % (_("Total unpacked size (bytes)"), + total_size['unpacked'])).encode()) + f.write((" %s: %d\n" % (_("Total packed size (bytes)"), + total_size['packed'])).encode()) + f.write(b"\n") # Mention issues with the report - f.write("== %s ==\n" % _("Caveats")) - f.write("=== %s ===\n" % _("Sizes")) + f.write(("== %s ==\n" % _("Caveats")).encode()) + f.write(("=== %s ===\n" % _("Sizes")).encode()) f.write(textwrap.dedent(_(""" Packed size represents what size your repository would be if no trees, commits, tags, or other metadata were included (though it may @@ -2400,9 +2408,9 @@ class RepoAnalyze(object): ever reverted to a previous 
version's contents, the previous version's size will be counted multiple times in this analysis, even though git will only store it once. - """)[1:])) - f.write("\n") - f.write("=== %s ===\n" % _("Deletions")) + """)[1:]).encode()) + f.write(b"\n") + f.write(("=== %s ===\n" % _("Deletions")).encode()) f.write(textwrap.dedent(_(""" Whether a file is deleted is not a binary quality, since it can be deleted on some branches but still exist in others. Also, it might @@ -2418,9 +2426,9 @@ class RepoAnalyze(object): stream that mentions the file lists it as deleted. This makes it dependent on topological ordering, but generally gives the "right" answer. - """)[1:])) - f.write("\n") - f.write("=== %s ===\n" % _("Renames")) + """)[1:]).encode()) + f.write(b"\n") + f.write(("=== %s ===\n" % _("Renames")).encode()) f.write(textwrap.dedent(_(""" Renames share the same non-binary nature that deletions do, plus additional challenges: @@ -2436,101 +2444,105 @@ class RepoAnalyze(object): * The ability for users to rename files differently in different branches means that our chains of renames will not necessarily be linear but may branch out. - """)[1:])) - f.write("\n") + """)[1:]).encode()) + f.write(b"\n") # Equivalence classes for names, so if folks only want to keep a # certain set of paths, they know the old names they want to include # too. - with open(os.path.join(reportdir, "renames.txt"), 'bw') as f: + with open(os.path.join(reportdir, b"renames.txt"), 'bw') as f: seen = set() for pathname,equiv_group in sorted(stats['equivalence'].items(), key=lambda x:(x[1], x[0])): if equiv_group in seen: continue seen.add(equiv_group) - f.write("{} ->\n ".format(decode(equiv_group[0])) + + f.write(("{} ->\n ".format(decode(equiv_group[0])) + "\n ".join(decode(x) for x in equiv_group[1:]) + - "\n") + "\n").encode()) # List directories in reverse sorted order of unpacked size - with open(os.path.join(reportdir, "directories-deleted-sizes.txt"), 'bw') as f: - f.write("=== %s ===\n" % _("Deleted directories by reverse size")) - f.write(_("Format: unpacked size, packed size, date deleted, directory name\n")) + with open(os.path.join(reportdir, b"directories-deleted-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("Deleted directories by reverse size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, directory name\n") + f.write(msg.encode()) for dirname, size in sorted(dir_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): if (dir_deleted_data[dirname]): - f.write(" {:10d} {:10d} {:10s} {}\n" - .format(dir_size['unpacked'][dirname], - size, - datestr(dir_deleted_data[dirname]), - dirname or _(''))) + f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], + size, + datestr(dir_deleted_data[dirname]), + dirname or _('').encode())) - with open(os.path.join(reportdir, "directories-all-sizes.txt"), 'bw') as f: - f.write("=== %s ===\n" % _("All directories by reverse size")) - f.write(_("Format: unpacked size, packed size, date deleted, directory name\n")) + with open(os.path.join(reportdir, b"directories-all-sizes.txt"), 'bw') as f: + f.write(("=== %s ===\n" % _("All directories by reverse size")).encode()) + msg = _("Format: unpacked size, packed size, date deleted, directory name\n") + f.write(msg.encode()) for dirname, size in sorted(dir_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): - f.write(" {:10d} {:10d} {:10s} {}\n" - .format(dir_size['unpacked'][dirname], - size, - datestr(dir_deleted_data[dirname]), - dirname or _(""))) + 
f.write(b" %10d %10d %-10s %s\n" % (dir_size['unpacked'][dirname], + size, + datestr(dir_deleted_data[dirname]), + dirname or _("").encode())) # List extensions in reverse sorted order of unpacked size - with open(os.path.join(reportdir, "extensions-deleted-sizes.txt"), 'bw') as f: - f.write("=== %s ===\n" % _("Deleted extensions by reverse size")) - f.write(_("Format: unpacked size, packed size, date deleted, extension name\n")) + with open(os.path.join(reportdir, b"extensions-deleted-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("Deleted extensions by reverse size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, extension name\n") + f.write(msg.encode()) for extname, size in sorted(ext_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): if (ext_deleted_data[extname]): - f.write(" {:10d} {:10d} {:10s} {}\n" - .format(ext_size['unpacked'][extname], - size, - datestr(ext_deleted_data[extname]), - extname or _(''))) + f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], + size, + datestr(ext_deleted_data[extname]), + extname or _('').encode())) - with open(os.path.join(reportdir, "extensions-all-sizes.txt"), 'bw') as f: - f.write("=== %s ===\n" % _("All extensions by reverse size")) - f.write(_("Format: unpacked size, packed size, date deleted, extension name\n")) + with open(os.path.join(reportdir, b"extensions-all-sizes.txt"), 'bw') as f: + f.write(("=== %s ===\n" % _("All extensions by reverse size")).encode()) + msg = _("Format: unpacked size, packed size, date deleted, extension name\n") + f.write(msg.encode()) for extname, size in sorted(ext_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): - f.write(" {:10d} {:10d} {:10s} {}\n" - .format(ext_size['unpacked'][extname], - size, - datestr(ext_deleted_data[extname]), - extname or _(''))) + f.write(b" %10d %10d %-10s %s\n" % (ext_size['unpacked'][extname], + size, + datestr(ext_deleted_data[extname]), + extname or _('').encode())) # List files in reverse sorted order of unpacked size - with open(os.path.join(reportdir, "path-deleted-sizes.txt"), 'bw') as f: - f.write("=== %s ===\n" % _("Deleted paths by reverse accumulated size")) - f.write(_("Format: unpacked size, packed size, date deleted, path name(s)\n")) + with open(os.path.join(reportdir, b"path-deleted-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("Deleted paths by reverse accumulated size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, path name(s)\n") + f.write(msg.encode()) for pathname, size in sorted(path_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): when = stats['file_deletions'].get(pathname, None) if when: - f.write(" {:10d} {:10d} {:10s} {}\n" - .format(path_size['unpacked'][pathname], - size, - datestr(when), - pathname)) + f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], + size, + datestr(when), + pathname)) - with open(os.path.join(reportdir, "path-all-sizes.txt"), 'bw') as f: - f.write("=== %s ===\n" % _("All paths by reverse accumulated size")) - f.write(_("Format: unpacked size, packed size, date deleted, pathectory name\n")) + with open(os.path.join(reportdir, b"path-all-sizes.txt"), 'bw') as f: + msg = "=== %s ===\n" % _("All paths by reverse accumulated size") + f.write(msg.encode()) + msg = _("Format: unpacked size, packed size, date deleted, pathectory name\n") + f.write(msg.encode()) for pathname, size in sorted(path_size['packed'].items(), key=lambda x:(x[1],x[0]), reverse=True): when = 
stats['file_deletions'].get(pathname, None) - f.write(" {:10d} {:10d} {:10s} {}\n" - .format(path_size['unpacked'][pathname], - size, - datestr(when), - pathname)) + f.write(b" %10d %10d %-10s %s\n" % (path_size['unpacked'][pathname], + size, + datestr(when), + pathname)) # List of filenames and sizes in descending order - with open(os.path.join(reportdir, "blob-shas-and-paths.txt"), 'bw') as f: - f.write("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")) - f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n")) + with open(os.path.join(reportdir, b"blob-shas-and-paths.txt"), 'bw') as f: + f.write(("=== %s ===\n" % _("Files by sha and associated pathnames in reverse size")).encode()) + f.write(_("Format: sha, unpacked size, packed size, filename(s) object stored as\n").encode()) for sha, size in sorted(stats['packed_size'].items(), key=lambda x:(x[1],x[0]), reverse=True): if sha not in stats['names']: @@ -2541,21 +2553,21 @@ class RepoAnalyze(object): if len(names_with_sha) == 1: names_with_sha = names_with_sha.pop() else: - names_with_sha = sorted(list(names_with_sha)) - f.write(" {} {:10d} {:10d} {}\n".format(sha, - stats['unpacked_size'][sha], - size, - names_with_sha)) + names_with_sha = b'[' + b', '.join(sorted(names_with_sha)) + b']' + f.write(b" %s %10d %10d %s\n" % (sha, + stats['unpacked_size'][sha], + size, + names_with_sha)) @staticmethod def run(args): - git_dir = GitUtils.determine_git_dir('.') + git_dir = GitUtils.determine_git_dir(b'.') # Create the report directory as necessary - results_tmp_dir = os.path.join(git_dir, 'filter-repo') + results_tmp_dir = os.path.join(git_dir, b'filter-repo') if not os.path.isdir(results_tmp_dir): os.mkdir(results_tmp_dir) - reportdir = os.path.join(results_tmp_dir, "analysis") + reportdir = os.path.join(results_tmp_dir, b"analysis") if not args.force and os.path.isdir(reportdir): shutil.rmtree(reportdir) os.mkdir(reportdir) @@ -2693,7 +2705,7 @@ class RepoFilter(object): # Do sanity checks from the correct directory tmp_dir = self.results_tmp_dir(create_if_missing=False) if not self._args.force and \ - not os.path.isfile(os.path.join(tmp_dir, 'already_ran')): + not os.path.isfile(os.path.join(tmp_dir, b'already_ran')): cwd = os.getcwd() os.chdir(target_working_dir) RepoFilter.sanity_check(self._orig_refs, is_bare) @@ -2710,27 +2722,27 @@ class RepoFilter(object): # Make sure repo is fully packed, just like a fresh clone would be output = subprocess.check_output('git count-objects -v'.split()) - stats = dict(x.split(': ') for x in output.splitlines()) - num_packs = int(stats['packs']) - if stats['count'] != '0' or num_packs > 1: + stats = dict(x.split(b': ') for x in output.splitlines()) + num_packs = int(stats[b'packs']) + if stats[b'count'] != b'0' or num_packs > 1: abort(_("expected freshly packed repo")) # Make sure there is precisely one remote, named "origin"...or that this # is a new bare repo with no packs and no remotes output = subprocess.check_output('git remote'.split()).strip() - if not (output == "origin" or (num_packs == 0 and not output)): + if not (output == b"origin" or (num_packs == 0 and not output)): abort(_("expected one remote, origin")) # Avoid letting people running with weird setups and overwriting GIT_DIR # elsewhere - git_dir = GitUtils.determine_git_dir('.') - if is_bare and git_dir != '.': + git_dir = GitUtils.determine_git_dir(b'.') + if is_bare and git_dir != b'.': abort(_("GIT_DIR must be .")) - elif not is_bare and git_dir != '.git': + elif not is_bare 
and git_dir != b'.git': abort(_("GIT_DIR must be .git")) # Make sure that all reflogs have precisely one entry - reflog_dir=os.path.join(git_dir, 'logs') + reflog_dir=os.path.join(git_dir, b'logs') for root, dirs, files in os.walk(reflog_dir): for filename in files: pathname = os.path.join(root, filename) @@ -2741,7 +2753,7 @@ class RepoFilter(object): decode(shortpath)) # Make sure there are no stashed changes - if 'refs/stash' in refs: + if b'refs/stash' in refs: abort(_("has stashed changes")) # Do extra checks in non-bare repos @@ -2756,9 +2768,9 @@ class RepoFilter(object): # Avoid unpushed changes for refname, rev in refs.items(): - if not refname.startswith('refs/heads/'): + if not refname.startswith(b'refs/heads/'): continue - origin_ref = refname.replace('refs/heads/', 'refs/remotes/origin/') + origin_ref = refname.replace(b'refs/heads/', b'refs/remotes/origin/') if origin_ref not in refs: abort(_('%s exists, but %s not found') % (decode(refname), decode(origin_ref))) @@ -2776,13 +2788,13 @@ class RepoFilter(object): def tweak_commit(self, commit): def filename_matches(path_expression, pathname): - if path_expression == '': + if path_expression == b'': return True n = len(path_expression) if (pathname.startswith(path_expression) and - (path_expression[n-1:n] == '/' or + (path_expression[n-1:n] == b'/' or len(pathname) == n or - pathname[n:n+1] == '/')): + pathname[n:n+1] == b'/')): return True return False @@ -2798,7 +2810,7 @@ class RepoFilter(object): if match_type == 'regex' and path_exp.search(pathname): wanted = True elif mod_type == 'rename': - old_exp, new_exp = path_exp.split(':') + old_exp, new_exp = path_exp.split(b':') assert match_type in ('prefix',) if match_type == 'prefix' and pathname.startswith(old_exp): pathname = pathname.replace(old_exp, new_exp, 1) @@ -2866,15 +2878,15 @@ class RepoFilter(object): # in sync with the original with any changes, and then decides # they want to rewrite history to only have one of the two files) colliding_change = new_file_changes[change.filename] - if change.type == 'D': + if change.type == b'D': # We can just throw this one away and keep the other continue - elif change.type == 'M' and ( + elif change.type == b'M' and ( change.mode == colliding_change.mode and change.blob_id == colliding_change.blob_id): # The two are identical, so we can throw this one away and keep other continue - elif new_file_changes[change.filename].type != 'D': + elif new_file_changes[change.filename].type != b'D': raise SystemExit(_("File renaming caused colliding pathnames!\n") + _(" Commit: {}\n").format(commit.original_id) + _(" Filename: {}").format(change.filename)) @@ -2883,8 +2895,8 @@ class RepoFilter(object): @staticmethod def do_tag_rename(rename_pair, tagname): - old, new = rename_pair.split(':', 1) - old, new = 'refs/tags/'+old, 'refs/tags/'+new + old, new = rename_pair.split(b':', 1) + old, new = b'refs/tags/'+old, b'refs/tags/'+new if tagname.startswith(old): return tagname.replace(old, new, 1) return tagname @@ -2895,7 +2907,7 @@ class RepoFilter(object): tag.message = self._message_callback(tag.message) # Tweak the tag name according to callbacks - tag_prefix = 'refs/tags/' + tag_prefix = b'refs/tags/' fullref = tag_prefix+tag.ref if self._args.tag_rename: fullref = RepoFilter.do_tag_rename(self._args.tag_rename, fullref) @@ -2923,9 +2935,9 @@ class RepoFilter(object): reset.ref = self._refname_callback(reset.ref) def results_tmp_dir(self, create_if_missing=True): - working_dir = self._args.target or self._args.source or '.' 
+ working_dir = self._args.target or self._args.source or b'.' git_dir = GitUtils.determine_git_dir(working_dir) - d = os.path.join(git_dir, 'filter-repo') + d = os.path.join(git_dir, b'filter-repo') if create_if_missing and not os.path.isdir(d): os.mkdir(d) return d @@ -2970,7 +2982,7 @@ class RepoFilter(object): self._input = self._fep.stdout if self._args.dry_run or self._args.debug: self._fe_orig = os.path.join(self.results_tmp_dir(), - 'fast-export.original') + b'fast-export.original') output = open(self._fe_orig, 'bw') self._input = InputFileBackup(self._input, output) if self._args.debug: @@ -2989,7 +3001,7 @@ class RepoFilter(object): self._import_pipes = (self._fip.stdin, self._fip.stdout) if self._args.dry_run or self._args.debug: self._fe_filt = os.path.join(self.results_tmp_dir(), - 'fast-export.filtered') + b'fast-export.filtered') self._output = open(self._fe_filt, 'bw') else: self._output = self._fip.stdin @@ -3003,7 +3015,7 @@ class RepoFilter(object): if self._args.dry_run: return refs_to_migrate = set(x for x in self._orig_refs - if x.startswith('refs/remotes/origin/')) + if x.startswith(b'refs/remotes/origin/')) if not refs_to_migrate: return if self._args.debug: @@ -3013,14 +3025,14 @@ class RepoFilter(object): stdin=subprocess.PIPE, cwd=target_working_dir) for ref in refs_to_migrate: - if ref == 'refs/remotes/origin/HEAD': - p.stdin.write('delete {} {}\n'.format(ref, self._orig_refs[ref])) + if ref == b'refs/remotes/origin/HEAD': + p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref])) del self._orig_refs[ref] continue - newref = ref.replace('refs/remotes/origin/', 'refs/heads/') + newref = ref.replace(b'refs/remotes/origin/', b'refs/heads/') if newref not in self._orig_refs: - p.stdin.write('create {} {}\n'.format(newref, self._orig_refs[ref])) - p.stdin.write('delete {} {}\n'.format(ref, self._orig_refs[ref])) + p.stdin.write(b'create %s %s\n' % (newref, self._orig_refs[ref])) + p.stdin.write(b'delete %s %s\n' % (ref, self._orig_refs[ref])) self._orig_refs[newref] = self._orig_refs[ref] del self._orig_refs[ref] p.stdin.close() @@ -3115,11 +3127,11 @@ class RepoFilter(object): if refs_to_nuke: if self._args.debug: print("[DEBUG] Deleting the following refs:\n "+ - decode("\n ".join(refs_to_nuke))) + decode(b"\n ".join(refs_to_nuke))) p = subprocess.Popen('git update-ref --stdin'.split(), stdin=subprocess.PIPE, cwd=target_working_dir) - p.stdin.write(''.join(["option no-deref\ndelete {}\n".format(x) + p.stdin.write(b''.join([b"option no-deref\ndelete %s\n" % x for x in refs_to_nuke])) p.stdin.close() if p.wait(): diff --git a/t/t9390-filter-repo.sh b/t/t9390-filter-repo.sh index 52221d1..8a674ab 100755 --- a/t/t9390-filter-repo.sh +++ b/t/t9390-filter-repo.sh @@ -450,15 +450,15 @@ test_expect_success C_LOCALE_OUTPUT '--analyze' ' head -n 9 README >actual && test_cmp expect actual && - cat | tr Q "\047" >expect <<-\EOF && + cat >expect <<-\EOF && === Files by sha and associated pathnames in reverse size === Format: sha, unpacked size, packed size, filename(s) object stored as a89c82a2d4b713a125a4323d25adda062cc0013d 44 48 numbers/medium.num f00c965d8307308469e537302baa73048488f162 21 37 numbers/small.num 2aa69a2a708eed00cb390e30f6bcc3eed773f390 20 36 whatever - 51b95456de9274c9a95f756742808dfd480b9b35 13 29 [QcapriciousQ, QfickleQ, QmercurialQ] - 732c85a1b3d7ce40ec8f78fd9ffea32e9f45fae0 5 20 [Qsequence/knowQ, Qwords/knowQ] - 34b6a0c9d02cb6ef7f409f248c0c1224ce9dd373 5 20 [Qsequence/toQ, Qwords/toQ] + 51b95456de9274c9a95f756742808dfd480b9b35 13 29 [capricious, 
fickle, mercurial] + 732c85a1b3d7ce40ec8f78fd9ffea32e9f45fae0 5 20 [sequence/know, words/know] + 34b6a0c9d02cb6ef7f409f248c0c1224ce9dd373 5 20 [sequence/to, words/to] 7ecb56eb3fa3fa6f19dd48bca9f971950b119ede 3 18 words/know EOF test_cmp expect blob-shas-and-paths.txt && @@ -795,7 +795,7 @@ test_expect_success 'incremental import' ' original=$(git rev-parse master) && git fast-export --reference-excluded-parents master~2..master \ - | git filter-repo --stdin --refname-callback "return \"develop\"" && + | git filter-repo --stdin --refname-callback "return b\"develop\"" && test "$(git rev-parse develop)" = "$original" ) ' diff --git a/t/t9391/commit_info.py b/t/t9391/commit_info.py index a0d34f3..01fd725 100755 --- a/t/t9391/commit_info.py +++ b/t/t9391/commit_info.py @@ -13,12 +13,12 @@ import git_filter_repo as fr def change_up_them_commits(commit): # Change the commit author - if commit.author_name == "Copy N. Paste": - commit.author_name = "Ima L. Oser" - commit.author_email = "aloser@my.corp" + if commit.author_name == b"Copy N. Paste": + commit.author_name = b"Ima L. Oser" + commit.author_email = b"aloser@my.corp" # Fix the author email - commit.author_email = re.sub("@my.crp", "@my.corp", commit.author_email) + commit.author_email = re.sub(b"@my.crp", b"@my.corp", commit.author_email) # Fix the committer date (bad timezone conversion in initial import) oldtime = fr.string_to_date(commit.committer_date) @@ -26,7 +26,7 @@ def change_up_them_commits(commit): commit.committer_date = fr.date_to_string(newtime) # Fix the commit message - commit.message = re.sub("Marketing is staffed with pansies", "", + commit.message = re.sub(b"Marketing is staffed with pansies", b"", commit.message) args = fr.FilteringOptions.parse_args(['--force']) diff --git a/t/t9391/create_fast_export_output.py b/t/t9391/create_fast_export_output.py index e2ef13c..1eb0a3d 100755 --- a/t/t9391/create_fast_export_output.py +++ b/t/t9391/create_fast_export_output.py @@ -23,82 +23,82 @@ out.importer_only() output = out._output -world = Blob("Hello") +world = Blob(b"Hello") world.dump(output) -bar = Blob("foo\n") +bar = Blob(b"foo\n") bar.dump(output) -master = Reset("refs/heads/master") +master = Reset(b"refs/heads/master") master.dump(output) -changes = [FileChanges('M', 'world', world.id, mode="100644"), - FileChanges('M', 'bar', bar.id, mode="100644")] +changes = [FileChanges(b'M', b'world', world.id, mode=b"100644"), + FileChanges(b'M', b'bar', bar.id, mode=b"100644")] when = datetime(year=2005, month=4, day=7, hour=15, minute=16, second=10, - tzinfo=FixedTimeZone("-0700")) + tzinfo=FixedTimeZone(b"-0700")) when_string = fr.date_to_string(when) -commit1 = Commit("refs/heads/master", - "A U Thor", "au@thor.email", when_string, - "Com M. Iter", "comm@iter.email", when_string, - "My first commit! Wooot!\n\nLonger description", +commit1 = Commit(b"refs/heads/master", + b"A U Thor", b"au@thor.email", when_string, + b"Com M. Iter", b"comm@iter.email", when_string, + b"My first commit! 
Wooot!\n\nLonger description", changes, parents = []) commit1.dump(output) -world = Blob("Hello\nHi") +world = Blob(b"Hello\nHi") world.dump(output) -world_link = Blob("world") +world_link = Blob(b"world") world_link.dump(output) -changes = [FileChanges('M', 'world', world.id, mode="100644"), - FileChanges('M', 'planet', world_link.id, mode="120000")] +changes = [FileChanges(b'M', b'world', world.id, mode=b"100644"), + FileChanges(b'M', b'planet', world_link.id, mode=b"120000")] when += timedelta(days=3, hours=4, minutes=6) when_string = fr.date_to_string(when) -commit2 = Commit("refs/heads/master", - "A U Thor", "au@thor.email", when_string, - "Com M. Iter", "comm@iter.email", when_string, - "Make a symlink to world called planet, modify world", +commit2 = Commit(b"refs/heads/master", + b"A U Thor", b"au@thor.email", when_string, + b"Com M. Iter", b"comm@iter.email", when_string, + b"Make a symlink to world called planet, modify world", changes, parents = [commit1.id]) commit2.dump(output) -script = Blob("#!/bin/sh\n\necho Hello") +script = Blob(b"#!/bin/sh\n\necho Hello") script.dump(output) -changes = [FileChanges('M', 'runme', script.id, mode="100755"), - FileChanges('D', 'bar')] -when_string = "1234567890 -0700" -commit3 = Commit("refs/heads/master", - "A U Thor", "au@thor.email", when_string, - "Com M. Iter", "comm@iter.email", when_string, - "Add runme script, remove bar", +changes = [FileChanges(b'M', b'runme', script.id, mode=b"100755"), + FileChanges(b'D', b'bar')] +when_string = b"1234567890 -0700" +commit3 = Commit(b"refs/heads/master", + b"A U Thor", b"au@thor.email", when_string, + b"Com M. Iter", b"comm@iter.email", when_string, + b"Add runme script, remove bar", changes, parents = [commit2.id]) commit3.dump(output) -progress = Progress("Done with the master branch now...") +progress = Progress(b"Done with the master branch now...") progress.dump(output) checkpoint = Checkpoint() checkpoint.dump(output) -devel = Reset("refs/heads/devel", commit1.id) +devel = Reset(b"refs/heads/devel", commit1.id) devel.dump(output) -world = Blob("Hello\nGoodbye") +world = Blob(b"Hello\nGoodbye") world.dump(output) -changes = [FileChanges('M', 'world', world.id, mode="100644")] -when = datetime(2006, 8, 17, tzinfo=FixedTimeZone("+0200")) +changes = [FileChanges(b'M', b'world', world.id, mode=b"100644")] +when = datetime(2006, 8, 17, tzinfo=FixedTimeZone(b"+0200")) when_string = fr.date_to_string(when) -commit4 = Commit("refs/heads/devel", - "A U Thor", "au@thor.email", when_string, - "Com M. Iter", "comm@iter.email", when_string, - "Modify world", +commit4 = Commit(b"refs/heads/devel", + b"A U Thor", b"au@thor.email", when_string, + b"Com M. Iter", b"comm@iter.email", when_string, + b"Modify world", changes, parents = [commit1.id]) commit4.dump(output) -world = Blob("Hello\nHi\nGoodbye") +world = Blob(b"Hello\nHi\nGoodbye") world.dump(output) when = fr.string_to_date(commit3.author_date) + timedelta(days=47) when_string = fr.date_to_string(when) @@ -106,22 +106,22 @@ when_string = fr.date_to_string(when) # to the first parent. Thus, despite the fact that runme and planet have # not changed and bar was not modified in the devel side, we have to list them # all anyway. 
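The merge commit's change list resumes below. As a stand-alone sanity check of the converted constructors, a minimal sketch (assuming git_filter_repo is importable, just as these fixtures assume; the blob content is a throwaway):

  import io
  import git_filter_repo as fr

  out = io.BytesIO()        # stands in for fast-import's binary stdin
  blob = fr.Blob(b"Hello")  # blob data must now be bytes, not str
  blob.dump(out)
  assert out.getvalue().startswith(b'blob\nmark :')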
-changes = [FileChanges('M', 'world', world.id, mode="100644"), - FileChanges('D', 'bar'), - FileChanges('M', 'runme', script.id, mode="100755"), - FileChanges('M', 'planet', world_link.id, mode="120000")] +changes = [FileChanges(b'M', b'world', world.id, mode=b"100644"), + FileChanges(b'D', b'bar'), + FileChanges(b'M', b'runme', script.id, mode=b"100755"), + FileChanges(b'M', b'planet', world_link.id, mode=b"120000")] -commit5 = Commit("refs/heads/devel", - "A U Thor", "au@thor.email", when_string, - "Com M. Iter", "comm@iter.email", when_string, - "Merge branch 'master'\n", +commit5 = Commit(b"refs/heads/devel", + b"A U Thor", b"au@thor.email", when_string, + b"Com M. Iter", b"comm@iter.email", when_string, + b"Merge branch 'master'\n", changes, parents = [commit4.id, commit3.id]) commit5.dump(output) -mytag = Tag("refs/tags/v1.0", commit5.id, - "His R. Highness", "royalty@my.kingdom", when_string, - "I bequeath to my peons this royal software") +mytag = Tag(b"refs/tags/v1.0", commit5.id, + b"His R. Highness", b"royalty@my.kingdom", when_string, + b"I bequeath to my peons this royal software") mytag.dump(output) out.finish() diff --git a/t/t9391/file_filter.py b/t/t9391/file_filter.py index 8540b7d..c3683fc 100755 --- a/t/t9391/file_filter.py +++ b/t/t9391/file_filter.py @@ -15,14 +15,14 @@ import sys import git_filter_repo as fr def drop_file_by_contents(blob): - bad_file_contents = 'The launch code is 1-2-3-4.' + bad_file_contents = b'The launch code is 1-2-3-4.' if blob.data == bad_file_contents: blob.skip() def drop_files_by_name(commit): new_file_changes = [] for change in commit.file_changes: - if not change.filename.endswith('.doc'): + if not change.filename.endswith(b'.doc'): new_file_changes.append(change) commit.file_changes = new_file_changes diff --git a/t/t9391/rename-master-to-develop.py b/t/t9391/rename-master-to-develop.py index 7a922d0..1acfef8 100755 --- a/t/t9391/rename-master-to-develop.py +++ b/t/t9391/rename-master-to-develop.py @@ -14,8 +14,8 @@ not try to handle any such special cases. 
import git_filter_repo as fr def my_commit_callback(commit): - if commit.branch == "refs/heads/master": - commit.branch = "refs/heads/develop" + if commit.branch == b"refs/heads/master": + commit.branch = b"refs/heads/develop" args = fr.FilteringOptions.default_options() args.force = True diff --git a/t/t9391/splice_repos.py b/t/t9391/splice_repos.py index 133044e..5993436 100755 --- a/t/t9391/splice_repos.py +++ b/t/t9391/splice_repos.py @@ -29,11 +29,11 @@ class InterleaveRepositories: def hold_commit(self, commit): commit.skip(new_id = commit.id) - letter = re.match('Commit (.)', commit.message).group(1) + letter = re.match(b'Commit (.)', commit.message).group(1) self.commit_map[letter] = commit def weave_commit(self, commit): - letter = re.match('Commit (.)', commit.message).group(1) + letter = re.match(b'Commit (.)', commit.message).group(1) prev_letter = bytes([ord(letter)-1]) # Splice in any extra commits needed @@ -53,10 +53,10 @@ class InterleaveRepositories: fr.record_id_rename(new_commit.id, commit.id) def run(self): - blob = fr.Blob('public gpg key contents') - tag = fr.Tag('gpg-pubkey', blob.id, - 'Ima Tagger', 'ima@tagg.er', '1136199845 +0300', - 'Very important explanation and stuff') + blob = fr.Blob(b'public gpg key contents') + tag = fr.Tag(b'gpg-pubkey', blob.id, + b'Ima Tagger', b'ima@tagg.er', b'1136199845 +0300', + b'Very important explanation and stuff') args = fr.FilteringOptions.parse_args(['--target', self.output_dir]) out = fr.RepoFilter(args) diff --git a/t/t9391/strip-cvs-keywords.py b/t/t9391/strip-cvs-keywords.py index ccd3c8d..ae7cda0 100755 --- a/t/t9391/strip-cvs-keywords.py +++ b/t/t9391/strip-cvs-keywords.py @@ -18,8 +18,8 @@ def strip_cvs_keywords(blob): # FIXME: Should first check if blob is a text file to avoid ruining # binaries. Could use python.magic here, or just output blob.data to # the unix 'file' command - pattern = r'\$(Id|Date|Source|Header|CVSHeader|Author|Revision):.*\$' - replacement = r'$\1$' + pattern = br'\$(Id|Date|Source|Header|CVSHeader|Author|Revision):.*\$' + replacement = br'$\1$' blob.data = re.sub(pattern, replacement, blob.data) args = fr.FilteringOptions.parse_args(['--force']) diff --git a/t/t9391/unusual.py b/t/t9391/unusual.py index da0cf89..190f82b 100755 --- a/t/t9391/unusual.py +++ b/t/t9391/unusual.py @@ -21,7 +21,7 @@ import textwrap import git_filter_repo as fr def handle_progress(progress): - print("Decipher this: "+bytes(reversed(progress.message))) + print(b"Decipher this: "+bytes(reversed(progress.message))) def handle_checkpoint(checkpoint_object): # Flip a coin; see if we want to pass the checkpoint through. @@ -44,8 +44,8 @@ def track_everything(obj): # projects, I'm just verifying an invariant of the current code. assert fr._IDS._reverse_translation[obj.id] == [obj.id - 1] -mystr = 'This is the contents of the blob' -compare = "Blob:\n blob\n mark :1\n data {}\n {}".format(len(mystr), mystr) +mystr = b'This is the contents of the blob' +compare = b"Blob:\n blob\n mark :1\n data %d\n %s" % (len(mystr), mystr) # Next line's only purpose is testing code coverage of something that helps # debugging git-filter-repo; it is NOT something external folks should depend # upon. 
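Before the final unusual.py hunk, one Python 3 subtlety from the splice_repos change above deserves a note: indexing or iterating bytes yields ints rather than length-1 strings, which is why the preceding letter is rebuilt with bytes([...]). An illustrative sketch:

  import re

  letter = re.match(b'Commit (.)', b'Commit B').group(1)  # b'B'
  assert letter[0] == 66                  # indexing bytes yields an int
  prev_letter = bytes([ord(letter) - 1])  # back to a one-byte value
  assert prev_letter == b'A'

ord() accepts a length-1 bytes object as well as a length-1 str, so the ord(letter) - 1 arithmetic carries over from Python 2 unchanged.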
@@ -102,14 +102,14 @@ stream = io.BytesIO(textwrap.dedent(''' from :3 M 100644 :1 salutation - '''[1:])) + '''[1:]).encode()) counts = collections.Counter() def look_for_reset(obj): print("Processing {}".format(obj)) counts[type(obj)] += 1 if type(obj) == fr.Reset: - assert obj.ref == 'refs/heads/B' + assert obj.ref == b'refs/heads/B' # Use all kinds of internals that external scripts should NOT use and which # are likely to break in the future, just to verify a few invariants... diff --git a/t/t9392-python-callback.sh b/t/t9392-python-callback.sh index 983879e..27c338c 100755 --- a/t/t9392-python-callback.sh +++ b/t/t9392-python-callback.sh @@ -51,7 +51,7 @@ test_expect_success '--filename-callback' ' setup filename-callback && ( cd filename-callback && - git filter-repo --filename-callback "return None if filename.endswith(\".doc\") else \"src/\"+filename" && + git filter-repo --filename-callback "return None if filename.endswith(b\".doc\") else b\"src/\"+filename" && git log --format=%n --name-only | sort | uniq | grep -v ^$ > f && ! grep file.doc f && COMPARE=$(wc -l log-messages && grep TLDR:...... log-messages >modified-messages && test_line_count = 6 modified-messages @@ -75,7 +75,7 @@ test_expect_success '--name-callback' ' setup name-callback && ( cd name-callback && - git filter-repo --name-callback "return name.replace(\"N.\", \"And\")" && + git filter-repo --name-callback "return name.replace(b\"N.\", b\"And\")" && git log --format=%an >log-person-names && grep Copy.And.Paste log-person-names ) @@ -85,7 +85,7 @@ test_expect_success '--email-callback' ' setup email-callback && ( cd email-callback && - git filter-repo --email-callback "return email.replace(\".com\", \".org\")" && + git filter-repo --email-callback "return email.replace(b\".com\", b\".org\")" && git log --format=%ae%n%ce >log-emails && ! grep .com log-emails && grep .org log-emails @@ -98,7 +98,7 @@ test_expect_success '--refname-callback' ' cd refname-callback && git filter-repo --refname-callback " dir,path = os.path.split(refname) - return dir+\"/prefix-\"+path" && + return dir+b\"/prefix-\"+path" && git show-ref | grep refs/heads/prefix-master && git show-ref | grep refs/tags/prefix-v1.0 && git show-ref | grep refs/tags/prefix-v2.0 @@ -110,7 +110,7 @@ test_expect_success '--refname-callback sanity check' ' ( cd refname-sanity-check && - test_must_fail git filter-repo --refname-callback "return re.sub(\"tags\", \"other-tags\", refname)" 2>../err && + test_must_fail git filter-repo --refname-callback "return re.sub(b\"tags\", b\"other-tags\", refname)" 2>../err && test_i18ngrep "fast-import requires tags to be in refs/tags/ namespace" ../err && rm ../err ) @@ -138,7 +138,7 @@ test_expect_success '--commit-callback' ' commit.committer_email = commit.author_email commit.committer_date = commit.author_date for change in commit.file_changes: - change.mode = \"100755\" + change.mode = b\"100755\" " && git log --format=%ae%n%ce >log-emails && ! grep committer@example.com log-emails && @@ -153,8 +153,8 @@ test_expect_success '--tag-callback' ' ( cd tag-callback && git filter-repo --tag-callback " - tag.tagger_name = \"Dr. \"+tag.tagger_name - tag.message = \"Awesome sauce \"+tag.message + tag.tagger_name = b\"Dr. \"+tag.tagger_name + tag.message = b\"Awesome sauce \"+tag.message " && git cat-file -p v2.0 | grep ^tagger.Dr\\. 
&& git cat-file -p v2.0 | grep ^Awesome.sauce.Super @@ -175,7 +175,7 @@ test_expect_success 'callback has return statement sanity check' ' ( cd callback_return_sanity && - test_must_fail git filter-repo --filename-callback "filename + \".txt\"" 2>../err&& + test_must_fail git filter-repo --filename-callback "filename + b\".txt\"" 2>../err&& test_i18ngrep "Error: --filename-callback should have a return statement" ../err && rm ../err )
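Taken together, these callback tests show the one user-visible consequence of the port: callback bodies passed on the command line now operate on bytes, so their literals need a b prefix. Written as a library callback rather than a shell snippet, the filter in the first test above is equivalent to this hypothetical function (a sketch, not part of the test suite):

  def filename_callback(filename):
    # filter-repo hands callbacks raw path bytes; returning None drops the file
    return None if filename.endswith(b".doc") else b"src/" + filename

  assert filename_callback(b"file.doc") is None
  assert filename_callback(b"README") == b"src/README"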