fix(api): keep proto contents in bytes for longer (#14446)

When we parse python protocols, we were doing it by (1) making it into a string with decode('utf-8') and then (2) passing the string ast.parse(). The problem with this is that decode('utf-8') does not apply "universal newlines", which means that the code object created by compiling the ast will have line numbers that are around twice what they should be under certain circumstances (windows machine, crlf file, mercury in the seventh house, etc). Then, when we go and display a nice error message about a syntax error or whatever, the user says "why is this error message pointing to a place past the end of my protocol". This should fix that by keeping the protocol contents in bytes form all the way through to passing ast.parse() a bytes that _has never been through str.decode('utf-8')_ which should preserve everything.
Opentrons · Feb 9, 2024 · 8dc6f79 · 8dc6f79
1 parent 277074a
commit 8dc6f79
Show file tree

Hide file tree

Showing 4 changed files with 46 additions and 22 deletions.
diff --git a/api/src/opentrons/protocols/parse.py b/api/src/opentrons/protocols/parse.py
@@ -192,7 +192,9 @@ def version_from_string(vstr: str) -> APIVersion:
     return APIVersion(major=int(matches.group(1)), minor=int(matches.group(2)))
 
 
-def _parse_json(protocol_contents: str, filename: Optional[str] = None) -> JsonProtocol:
+def _parse_json(
+    protocol_contents: Union[str, bytes], filename: Optional[str] = None
+) -> JsonProtocol:
     """Parse a protocol known or at least suspected to be json"""
     protocol_json = json.loads(protocol_contents)
     version, validated = validate_json(protocol_json)
@@ -208,7 +210,7 @@ def _parse_json(protocol_contents: str, filename: Optional[str] = None) -> JsonP
 
 
 def _parse_python(
-    protocol_contents: str,
+    protocol_contents: Union[str, bytes],
     python_parse_mode: PythonParseMode,
     filename: Optional[str] = None,
     bundled_labware: Optional[Dict[str, "LabwareDefinition"]] = None,
@@ -338,28 +340,37 @@ def parse(
             )
         return result
     else:
-        if isinstance(protocol_file, bytes):
-            protocol_str = protocol_file.decode("utf-8")
-        else:
-            protocol_str = protocol_file
-
         if filename and filename.endswith(".json"):
-            return _parse_json(protocol_str, filename)
+            return _parse_json(protocol_file, filename)
         elif filename and filename.endswith(".py"):
             return _parse_python(
-                protocol_contents=protocol_str,
+                protocol_contents=protocol_file,
                 python_parse_mode=python_parse_mode,
                 filename=filename,
                 extra_labware=extra_labware,
                 bundled_data=extra_data,
             )
 
-        # our jsonschema says the top level json kind is object
-        if protocol_str and protocol_str[0] in ("{", b"{"):
-            return _parse_json(protocol_str, filename)
+        # our jsonschema says the top level json kind is object so we can
+        # rely on it starting with a { if it's valid. that could either be
+        # a string or bytes.
+        #
+        # if it's a string, then if the protocol file starts with a { and
+        # we do protocol_file[0] then we get the string "{".
+        #
+        # if it's a bytes, then if the protocol file starts with the ascii or
+        # utf-8 representation of { and we do protocol_file[0] we get 123,
+        # because while single elements of strings are strings, single elements
+        # of bytes are the byte value as a number.
+        #
+        # to get that number we could either use ord() or do what we do here
+        # which I think is a little nicer, if any of the above can be called
+        # "nice".
+        if protocol_file and protocol_file[0] in ("{", b"{"[0]):
+            return _parse_json(protocol_file, filename)
         else:
             return _parse_python(
-                protocol_contents=protocol_str,
+                protocol_contents=protocol_file,
                 python_parse_mode=python_parse_mode,
                 filename=filename,
                 extra_labware=extra_labware,
@@ -499,6 +510,7 @@ def _version_from_static_python_info(
     """
     from_requirements = (static_python_info.requirements or {}).get("apiLevel", None)
     from_metadata = (static_python_info.metadata or {}).get("apiLevel", None)
+
     requested_level = from_requirements or from_metadata
     if requested_level is None:
         return None

diff --git a/api/src/opentrons/protocols/types.py b/api/src/opentrons/protocols/types.py
@@ -31,7 +31,13 @@ class StaticPythonInfo:
 
 @dataclass(frozen=True)
 class _ProtocolCommon:
-    text: str
+    text: Union[str, bytes]
+    """The original text of the protocol file in the format it was specified with.
+
+    This leads to a wide type but it is actually quite important that we do not ever
+    str.decode('utf-8') this because it will break the interpreter's understanding of
+    line numbers for if we have to format an exception.
+    """
 
     filename: Optional[str]
     """The original name of the main protocol file, if it had a name.
@@ -74,7 +80,7 @@ class PythonProtocol(_ProtocolCommon):
 
 
 class BundleContents(NamedTuple):
-    protocol: str
+    protocol: Union[str, bytes]
     bundled_labware: Dict[str, "LabwareDefinition"]
     bundled_data: Dict[str, bytes]
     bundled_python: Dict[str, str]

diff --git a/api/src/opentrons/util/entrypoint_util.py b/api/src/opentrons/util/entrypoint_util.py
@@ -166,7 +166,10 @@ def adapt_protocol_source(protocol: Protocol) -> Generator[ProtocolSource, None,
         # through the filesystem. https://opentrons.atlassian.net/browse/RSS-281
 
         main_file = pathlib.Path(temporary_directory) / main_file_name
-        main_file.write_text(protocol.text, encoding="utf-8")
+        if isinstance(protocol.text, str):
+            main_file.write_text(protocol.text, encoding="utf-8")
+        else:
+            main_file.write_bytes(protocol.text)
 
         labware_files: List[pathlib.Path] = []
         if isinstance(protocol, PythonProtocol) and protocol.extra_labware is not None:

diff --git a/api/tests/opentrons/protocols/test_parse.py b/api/tests/opentrons/protocols/test_parse.py
@@ -1,6 +1,6 @@
 import json
 from textwrap import dedent
-from typing import Any, Callable, Optional, Union
+from typing import Any, Callable, Optional, Union, Literal
 
 import pytest
 from opentrons_shared_data.robot.dev_types import RobotType
@@ -407,7 +407,7 @@ def run(ctx): pass
 @pytest.mark.parametrize("filename", ["protocol.py", None])
 def test_parse_python_details(
     protocol_source: str,
-    protocol_text_kind: str,
+    protocol_text_kind: Literal["str", "bytes"],
     filename: Optional[str],
     expected_api_level: APIVersion,
     expected_robot_type: RobotType,
@@ -423,8 +423,11 @@ def test_parse_python_details(
     parsed = parse(text, filename)
 
     assert isinstance(parsed, PythonProtocol)
-    assert parsed.text == protocol_source
-    assert isinstance(parsed.text, str)
+    assert parsed.text == text
+    if protocol_text_kind == "str":
+        assert isinstance(parsed.text, str)
+    else:
+        assert isinstance(parsed.text, bytes)
 
     assert parsed.filename == filename
     assert parsed.contents.co_filename == (
@@ -454,13 +457,13 @@ def test_parse_json_details(
     get_json_protocol_fixture: Callable[..., Any],
     fixture_version: str,
     fixture_name: str,
-    protocol_text_kind: str,
+    protocol_text_kind: Literal["str", "bytes"],
     filename: str,
 ) -> None:
     protocol = get_json_protocol_fixture(
         fixture_version=fixture_version, fixture_name=fixture_name, decode=False
     )
-    if protocol_text_kind == "text":
+    if protocol_text_kind == "str":
         protocol_text: Union[bytes, str] = protocol
     else:
         protocol_text = protocol.encode("utf-8")