Skip to content

Commit

Permalink
Fix a bug in unicode string dep inference. (cherrypick of #11879) (#1…
Browse files Browse the repository at this point in the history
…1881)

The dep extraction code calls print() to emit any imports and import-like strings it finds to stdout.
print() encodes its output using stdout's default encoding, which in some environments is ASCII,
causing it to fail with an encoding error on non-ASCII strings.

This change replaces the naive call to print() with explicitly encoding the string as UTF-8 and writing
the resulting raw bytes directly to stdout, so the output no longer depends on the default encoding.

[ci skip-rust]

[ci skip-build-wheels]
  • Loading branch information
benjyw authored Apr 10, 2021
1 parent 7e8870e commit 66c94e2
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
# This regex is used to infer imports from strings, e.g.
# `importlib.import_module("example.subdir.Foo")`.
STRING_IMPORT_REGEX = re.compile(r"^([a-z_][a-z_\\d]*\\.){2,}[a-zA-Z_]\\w*$")
STRING_IMPORT_REGEX = re.compile(r"^([a-z_][a-z_\\d]*\\.){2,}[a-zA-Z_]\\w*$", re.UNICODE)
class AstVisitor(ast.NodeVisitor):
def __init__(self, package_parts):
Expand Down Expand Up @@ -126,9 +126,12 @@ def parse_file(filename):
explicit_imports.update(visitor.explicit_imports)
string_imports.update(visitor.string_imports)
print("\\n".join(sorted(explicit_imports)))
print("\\n--")
print("\\n".join(sorted(string_imports)))
# We have to be careful to set the encoding explicitly and write raw bytes ourselves.
# See below for where we explicitly decode.
buffer = sys.stdout if sys.version_info[0:2] == (2,7) else sys.stdout.buffer
buffer.write("\\n".join(sorted(explicit_imports)).encode("utf8"))
buffer.write(b"\\n--\\n")
buffer.write("\\n".join(sorted(string_imports)).encode("utf8"))
"""


Expand Down Expand Up @@ -178,7 +181,9 @@ async def parse_python_imports(request: ParsePythonImportsRequest) -> ParsedPyth
level=LogLevel.DEBUG,
),
)
explicit_imports, _, string_imports = process_result.stdout.decode().partition("--")
# See above for where we explicitly encoded as utf8. Even though utf8 is the
# default for decode(), we make that explicit here for emphasis.
explicit_imports, _, string_imports = process_result.stdout.decode("utf8").partition("--")
return ParsedPythonImports(
explicit_imports=FrozenOrderedSet(explicit_imports.strip().splitlines()),
string_imports=FrozenOrderedSet(string_imports.strip().splitlines()),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def test_imports_from_strings(rule_runner: RuleRunner) -> None:
'a.b.c.d.e.f.g.Baz',
'a.b_c.d._bar',
'a.b2.c.D',
'a.b.c_狗',
# Invalid strings
'..a.b.c.d',
Expand Down Expand Up @@ -176,6 +177,7 @@ def test_imports_from_strings(rule_runner: RuleRunner) -> None:
"a.b.c.d.e.f.g.Baz",
"a.b_c.d._bar",
"a.b2.c.D",
"a.b.c_狗",
],
)

Expand All @@ -198,6 +200,7 @@ def test_gracefully_handle_no_sources(rule_runner: RuleRunner) -> None:
def test_works_with_python2(rule_runner: RuleRunner) -> None:
content = dedent(
"""\
# -*- coding: utf-8 -*-
print "Python 2 lives on."
import demo
Expand All @@ -208,6 +211,7 @@ def test_works_with_python2(rule_runner: RuleRunner) -> None:
importlib.import_module(b"dep.from.bytes")
importlib.import_module(u"dep.from.str")
importlib.import_module(u"dep.from.str_狗")
"""
)
assert_imports_parsed(
Expand All @@ -220,7 +224,7 @@ def test_works_with_python2(rule_runner: RuleRunner) -> None:
"pkg_resources",
"treat.as.a.regular.import.not.a.string.import",
],
expected_string=["dep.from.bytes", "dep.from.str"],
expected_string=["dep.from.bytes", "dep.from.str", "dep.from.str_狗"],
)


Expand Down

0 comments on commit 66c94e2

Please sign in to comment.