From e1596776c98bdc9bdb96287275f956f57286455c Mon Sep 17 00:00:00 2001
From: Yufan Lou <2263580+louy2@users.noreply.github.com>
Date: Tue, 14 Mar 2023 01:11:14 -0400
Subject: [PATCH] Fix "Passed but got" error on CJK file names

filter-repo callback passes unicode filename as utf_8 bytes
but `git check-ignore` prints unicode filename as quoted octal escaped utf_8 bytes
failing the `name != pathname` check on CJK filenames

`.decode('unicode_escape')` decodes latin-1 bytes with escaped unicode
so it decodes the escaped bytes, but into a latin-1 str, therefore
`.encode('latin_1')` recovers the original bytes, which is utf_8
and is comparable to the filename passed by filter-repo callback

Signed-off-by: Yufan Lou <2263580+louy2@users.noreply.github.com>
---
 contrib/filter-repo-demos/clean-ignore | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/contrib/filter-repo-demos/clean-ignore b/contrib/filter-repo-demos/clean-ignore
index 06823ee..852e778 100755
--- a/contrib/filter-repo-demos/clean-ignore
+++ b/contrib/filter-repo-demos/clean-ignore
@@ -50,6 +50,14 @@ class CheckIgnores:
         self.check_ignore_process.stdin.flush()
         result = self.check_ignore_process.stdout.readline().rstrip(b'\n')
         (rest, pathname) = result.split(b"\t")
+        # filter-repo callback passes unicode filename as utf_8 bytes
+        # but `git check-ignore` prints unicode filename as quoted octal escaped utf_8 bytes
+        # failing the `name != pathname` check on CJK filenames
+        # `.decode('unicode_escape')` decodes latin-1 bytes with escaped unicode
+        # so it decodes the escaped bytes, but into a latin-1 str, therefore
+        # `.encode('latin_1')` recovers the original bytes, which is utf_8
+        # and is comparable to the filename passed by filter-repo callback
+        pathname = pathname.strip(b'"').decode('unicode_escape').encode('latin_1')
         if name != pathname:
           raise SystemExit("Error: Passed {} but got {}".format(name, pathname))
         if rest == b'::':