Fix a bug in unicode string dep inference. (cherrypick of #11879) (#11881)

The dep extraction code calls print() to emit any imports and import-like strings it finds to stdout. print() uses a default encoding, which in some cases might be ASCII, causing it to fail on non-ASCII strings. This change replaces the naive call to print() with explicitly encoding the string as UTF-8 and writing the resulting raw bytes. [ci skip-rust] [ci skip-build-wheels]
pantsbuild · Apr 10, 2021 · 66c94e2 · 66c94e2
1 parent 7e8870e
commit 66c94e2
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 6 deletions.
diff --git a/src/python/pants/backend/python/dependency_inference/import_parser.py b/src/python/pants/backend/python/dependency_inference/import_parser.py
@@ -30,7 +30,7 @@
 
 # This regex is used to infer imports from strings, e.g.
 #  `importlib.import_module("example.subdir.Foo")`.
-STRING_IMPORT_REGEX = re.compile(r"^([a-z_][a-z_\\d]*\\.){2,}[a-zA-Z_]\\w*$")
+STRING_IMPORT_REGEX = re.compile(r"^([a-z_][a-z_\\d]*\\.){2,}[a-zA-Z_]\\w*$", re.UNICODE)
 
 class AstVisitor(ast.NodeVisitor):
     def __init__(self, package_parts):
@@ -126,9 +126,12 @@ def parse_file(filename):
         explicit_imports.update(visitor.explicit_imports)
         string_imports.update(visitor.string_imports)
 
-    print("\\n".join(sorted(explicit_imports)))
-    print("\\n--")
-    print("\\n".join(sorted(string_imports)))
+    # We have to be careful to set the encoding explicitly and write raw bytes ourselves.
+    # See below for where we explicitly decode.
+    buffer = sys.stdout if sys.version_info[0:2] == (2,7) else sys.stdout.buffer
+    buffer.write("\\n".join(sorted(explicit_imports)).encode("utf8"))
+    buffer.write(b"\\n--\\n")
+    buffer.write("\\n".join(sorted(string_imports)).encode("utf8"))
 """
 
 
@@ -178,7 +181,9 @@ async def parse_python_imports(request: ParsePythonImportsRequest) -> ParsedPyth
             level=LogLevel.DEBUG,
         ),
     )
-    explicit_imports, _, string_imports = process_result.stdout.decode().partition("--")
+    # See above for where we explicitly encoded as utf8. Even though utf8 is the
+    # default for decode(), we make that explicit here for emphasis.
+    explicit_imports, _, string_imports = process_result.stdout.decode("utf8").partition("--")
     return ParsedPythonImports(
         explicit_imports=FrozenOrderedSet(explicit_imports.strip().splitlines()),
         string_imports=FrozenOrderedSet(string_imports.strip().splitlines()),

diff --git a/src/python/pants/backend/python/dependency_inference/import_parser_test.py b/src/python/pants/backend/python/dependency_inference/import_parser_test.py
@@ -146,6 +146,7 @@ def test_imports_from_strings(rule_runner: RuleRunner) -> None:
             'a.b.c.d.e.f.g.Baz',
             'a.b_c.d._bar',
             'a.b2.c.D',
+            'a.b.c_狗',
 
             # Invalid strings
             '..a.b.c.d',
@@ -176,6 +177,7 @@ def test_imports_from_strings(rule_runner: RuleRunner) -> None:
             "a.b.c.d.e.f.g.Baz",
             "a.b_c.d._bar",
             "a.b2.c.D",
+            "a.b.c_狗",
         ],
     )
 
@@ -198,6 +200,7 @@ def test_gracefully_handle_no_sources(rule_runner: RuleRunner) -> None:
 def test_works_with_python2(rule_runner: RuleRunner) -> None:
     content = dedent(
         """\
+        # -*- coding: utf-8 -*-
         print "Python 2 lives on."
 
         import demo
@@ -208,6 +211,7 @@ def test_works_with_python2(rule_runner: RuleRunner) -> None:
 
         importlib.import_module(b"dep.from.bytes")
         importlib.import_module(u"dep.from.str")
+        importlib.import_module(u"dep.from.str_狗")
         """
     )
     assert_imports_parsed(
@@ -220,7 +224,7 @@ def test_works_with_python2(rule_runner: RuleRunner) -> None:
             "pkg_resources",
             "treat.as.a.regular.import.not.a.string.import",
         ],
-        expected_string=["dep.from.bytes", "dep.from.str"],
+        expected_string=["dep.from.bytes", "dep.from.str", "dep.from.str_狗"],
     )