From ad3c839263be4ff7798ed8ae7cf5cf7f89806138 Mon Sep 17 00:00:00 2001
From: Elijah Newren <newren@gmail.com>
Date: Fri, 26 Apr 2019 17:59:50 -0700
Subject: [PATCH] filter-repo (python3): handle conversion of glob to regex

python3 forces a couple issues for us with the conversion of globs to
regexes:
  * fnmatch.translate() will ONLY operate on unicode strings, not
    bytestrings.  Super lame.
  * newer versions of python3 modified the regex style used by
    fnmatch.translate() causing us to need extra logic to 'fixup'
    the regex into the form we want.
Split the code for translating the glob to a regex out into a separate
function which now houses more complicated logic to handle these extra
conditions.

Signed-off-by: Elijah Newren <newren@gmail.com>
---
 git-filter-repo | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/git-filter-repo b/git-filter-repo
index 5420acd..399a921 100755
--- a/git-filter-repo
+++ b/git-filter-repo
@@ -119,6 +119,33 @@ def decode(bytestr):
   'Try to convert bytestr to utf-8 for outputting as an error message.'
   return bytestr.decode('utf-8', 'backslashreplace')
 
+def glob_to_regex(glob_bytestr):
+  """Translate glob_bytestr into a regex on bytestrings.
+
+  Returns the regex as bytes; raises SystemExit if glob_bytestr does not
+  round-trip through decode(), since fnmatch.translate only accepts str.
+  """
+  glob_str = decode(glob_bytestr)
+  if glob_str.encode() != glob_bytestr: # pragma: no cover
+    # The message has a %s placeholder: use %-formatting, not str.format()
+    raise SystemExit(_("Error: Cannot handle glob %s") % glob_str)
+
+  # Create regex operating on string
+  regex = fnmatch.translate(glob_str)
+
+  # FIXME: This is an ugly hack...
+  # fnmatch.translate tries to do multi-line matching and wants the glob to
+  # match up to the end of the input, which isn't relevant for us.  Python
+  # versions spell that differently (a trailing '\Z(?ms)', or a '(?s:...)'
+  # wrapper followed by '\Z' or '\z'), so check for each form and strip it.
+  if regex.endswith(r'\Z(?ms)'): # pragma: no cover
+    regex = regex[0:-7]
+  elif regex.startswith(r'(?s:') and regex.endswith((r')\Z', r')\z')): # pragma: no cover
+    regex = regex[4:-3]
+
+  # Finally, convert back to regex operating on bytestr
+  return regex.encode()
+
 class PathQuoting:
   _unescape = {'a': '\a',
                'b': '\b',
@@ -2037,9 +2064,7 @@ class FilteringOptions(object):
         if line.startswith('regex:'):
           regex = line[6:]
         elif line.startswith('glob:'):
-          regex = fnmatch.translate(line[5:])
-          if regex.endswith(r'\Z(?ms)'):
-            regex = regex[0:-7]
+          regex = glob_to_regex(line[5:])
         if regex:
           replace_regexes.append((re.compile(regex), replacement))
         else: