From 7424df5d64f6b0a7213df67c078d47e2cf855e27 Mon Sep 17 00:00:00 2001 From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com> Date: Wed, 17 Mar 2021 11:51:08 -0400 Subject: [PATCH] Simplify the escaping of backslashes in quoted strings Fixes #615 --- omegaconf/base.py | 12 +- omegaconf/grammar/OmegaConfGrammarParser.g4 | 2 +- omegaconf/grammar_visitor.py | 16 +-- tests/test_grammar.py | 140 ++++++++++---------- 4 files changed, 85 insertions(+), 85 deletions(-) diff --git a/omegaconf/base.py b/omegaconf/base.py index 4a0a213fd..c816728c2 100644 --- a/omegaconf/base.py +++ b/omegaconf/base.py @@ -610,10 +610,13 @@ def _maybe_resolve_interpolation( key: Any, value: "Node", throw_on_resolution_failure: bool, + force_resolution: bool = False, ) -> Any: - value_kind = get_value_kind(value) - if value_kind != ValueKind.INTERPOLATION: - return value + if not force_resolution: + # Skip resolution if not an interpolation. + value_kind = get_value_kind(value) + if value_kind != ValueKind.INTERPOLATION: + return value parse_tree = parse(_get_value(value)) return self._resolve_interpolation_from_parse_tree( @@ -662,6 +665,9 @@ def quoted_string_callback(quoted_str: str) -> str: is_optional=True, ), throw_on_resolution_failure=True, + # We must always process the string as if it contained an + # interpolation, for proper un-escaping of backslashes (\\). + force_resolution=True, ) return str(quoted_val) diff --git a/omegaconf/grammar/OmegaConfGrammarParser.g4 b/omegaconf/grammar/OmegaConfGrammarParser.g4 index 22933a953..8474233f5 100644 --- a/omegaconf/grammar/OmegaConfGrammarParser.g4 +++ b/omegaconf/grammar/OmegaConfGrammarParser.g4 @@ -17,7 +17,7 @@ options {tokenVocab = OmegaConfGrammarLexer;} // Main rules used to parse OmegaConf strings. -configValue: (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+)) EOF; +configValue: (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+))? EOF; singleElement: element EOF; // Top-level string (that does not need to be parsed). diff --git a/omegaconf/grammar_visitor.py b/omegaconf/grammar_visitor.py index 56111e557..4d173eefc 100644 --- a/omegaconf/grammar_visitor.py +++ b/omegaconf/grammar_visitor.py @@ -97,10 +97,9 @@ def visitConfigKey(self, ctx: OmegaConfGrammarParser.ConfigKeyContext) -> str: return child.symbol.text def visitConfigValue(self, ctx: OmegaConfGrammarParser.ConfigValueContext) -> Any: - # (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+)) EOF + # (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+))? EOF # Visit all children (except last one which is EOF) vals = [self.visit(c) for c in list(ctx.getChildren())[:-1]] - assert vals if len(vals) == 1 and isinstance( ctx.getChild(0), OmegaConfGrammarParser.InterpolationContext ): @@ -335,16 +334,11 @@ def _resolve_quoted_string(self, quoted: str) -> str: quote_type = quoted[0] assert quote_type in ["'", '"'] - # Un-escape quotes and backslashes within the string (the two kinds of - # escapable characters in quoted strings). We do it in two passes: - # 1. Replace `\"` with `"` (and similarly for single quotes) - # 2. Replace `\\` with `\` - # The order is important so that `\\"` is replaced with an escaped quote `\"`. - # We also remove the start and end quotes. + # Remove start and end quotes and un-escape quotes within the string. + # Note that, importantly, we do *not* un-escape backslashes (\\) here: + # instead, we rely on `quoted_string_callback()` to take care of these. esc_quote = f"\\{quote_type}" - quoted_content = ( - quoted[1:-1].replace(esc_quote, quote_type).replace("\\\\", "\\") - ) + quoted_content = quoted[1:-1].replace(esc_quote, quote_type) # Parse the string. return self.quoted_string_callback(quoted_content) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 963b9b11d..db0d6ae83 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -52,7 +52,7 @@ ) -# Parameters for tests of the "singleElement" rule when there is no interpolation. +# Parameters for tests of the "singleElement" rule when there is no need for callbacks. # Each item is a tuple with three elements: # - The id of the test. # - The expression to be evaluated. @@ -62,7 +62,7 @@ # visiting (= evaluating) the parse tree. If the expected behavior is for the parsing # to succeed, but a `GrammarParseError` to be raised when visiting it, then set the # expected result to the pair `(None, GrammarParseError)`. -PARAMS_SINGLE_ELEMENT_NO_INTERPOLATION: List[Tuple[str, str, Any]] = [ +PARAMS_SINGLE_ELEMENT_NO_CALLBACK: List[Tuple[str, str, Any]] = [ # Special keywords. ("null", "null", None), ("true", "TrUe", True), @@ -121,22 +121,6 @@ ("str_backslash_noesc", r"ab\cd", r"ab\cd"), ("str_esc_illegal_1", r"\#", GrammarParseError), ("str_esc_illegal_2", "\\'\\\"", GrammarParseError), - # Quoted strings. - ("str_quoted_single", "'!@#$%^&*()[]:.,\"'", '!@#$%^&*()[]:.,"'), - ("str_quoted_double", '"!@#$%^&*()[]:.,\'"', "!@#$%^&*()[]:.,'"), - ("str_quoted_outer_ws_single", "' a \t'", " a \t"), - ("str_quoted_outer_ws_double", '" a \t"', " a \t"), - ("str_quoted_int", "'123'", "123"), - ("str_quoted_null", "'null'", "null"), - ("str_quoted_bool", "['truE', \"FalSe\"]", ["truE", "FalSe"]), - ("str_quoted_list", "'[a,b, c]'", "[a,b, c]"), - ("str_quoted_dict", '"{a:b, c: d}"', "{a:b, c: d}"), - ("str_quoted_backslash_noesc_single", r"'a\b'", r"a\b"), - ("str_quoted_backslash_noesc_double", r'"a\b"', r"a\b"), - ("str_quoted_concat_bad_2", "'Hi''there'", GrammarParseError), - ("str_quoted_too_many_1", "''a'", GrammarParseError), - ("str_quoted_too_many_2", "'a''", GrammarParseError), - ("str_quoted_too_many_3", "''a''", GrammarParseError), # Lists and dictionaries. ("list", "[0, 1]", [0, 1]), ( @@ -149,37 +133,10 @@ "{a0-null-1-3.14-NaN- \t-true-False-/\\+.$%*@\\(\\)\\[\\]\\{\\}\\:\\=\\ \\\t\\,:0}", {"a0-null-1-3.14-NaN- \t-true-False-/\\+.$%*@()[]{}:= \t,": 0}, ), - ( - "dict_quoted", - "{0: 1, 'a': 'b', 1.1: 1e2, null: 0.1, true: false, -inf: true}", - {0: 1, "a": "b", 1.1: 100.0, None: 0.1, True: False, -math.inf: True}, - ), - ( - "structured_mixed", - "[10,str,3.14,true,false,inf,[1,2,3], 'quoted', \"quoted\", 'a,b,c']", - [ - 10, - "str", - 3.14, - True, - False, - math.inf, - [1, 2, 3], - "quoted", - "quoted", - "a,b,c", - ], - ), ("dict_int_key", "{0: 0}", {0: 0}), ("dict_float_key", "{1.1: 0}", {1.1: 0}), ("dict_null_key", "{null: 0}", {None: 0}), - ("dict_nan_like_key", "{'nan': 0}", {"nan": 0}), ("dict_list_as_key", "{[0]: 1}", GrammarParseError), - ( - "dict_bool_key", - "{true: true, false: 'false'}", - {True: True, False: "false"}, - ), ("empty_dict", "{}", {}), ("empty_list", "[]", []), ( @@ -189,8 +146,8 @@ ), ] -# Parameters for tests of the "singleElement" rule when there are interpolations. -PARAMS_SINGLE_ELEMENT_WITH_INTERPOLATION = [ +# Parameters for tests of the "singleElement" rule when callbacks are required. +PARAMS_SINGLE_ELEMENT_WITH_CALLBACK = [ # Node interpolations. ("dict_access", "${dict.a}", 0), ("list_access", "${list.0}", -1), @@ -206,21 +163,67 @@ ("dotpath_bad_type", "${dict.${float}}", (None, InterpolationResolutionError)), ("at_in_key", "${x@y}", 123), ("dollar_in_key", "${$x$y$z$}", 456), - # Interpolations in dictionaries. + # Dictionaries. ("dict_interpolation_value", "{hi: ${str}, int: ${int}}", {"hi": "hi", "int": 123}), ("dict_interpolation_key", "{${str}: 0, ${null}: 1", GrammarParseError), - # Interpolations in lists. + ("dict_bool_key", "{true: true, false: 'false'}", {True: True, False: "false"}), + ( + "dict_quoted", + "{0: 1, 'a': 'b', 1.1: 1e2, null: 0.1, true: false, -inf: true}", + {0: 1, "a": "b", 1.1: 100.0, None: 0.1, True: False, -math.inf: True}, + ), + # NaN as dictionary key (a resolver is used here to output only the key). + ("dict_nan_key_1", "${first:{nan: 0}}", math.nan), + ("dict_nan_key_2", "${first:{${test:nan}: 0}}", GrammarParseError), + ("dict_nan_like_key", "{'nan': 0}", {"nan": 0}), + # Lists. ("list_interpolation", "[${str}, ${int}]", ["hi", 123]), - # Interpolations in unquoted strings. + ( + "structured_mixed", + "[10,str,3.14,true,false,inf,[1,2,3], 'quoted', \"quoted\", 'a,b,c']", + [ + 10, + "str", + 3.14, + True, + False, + math.inf, + [1, 2, 3], + "quoted", + "quoted", + "a,b,c", + ], + ), + # Unquoted strings. ("str_dollar_and_inter", "$$${str}", "$$hi"), ("str_inter", "hi_${str}", "hi_hi"), ("str_esc_illegal_3", r"\${foo\}", GrammarParseError), - # Interpolations in quoted strings. + # Quoted strings without interpolations. + ("str_quoted_single", "'!@#$%^&*()[]:.,\"'", '!@#$%^&*()[]:.,"'), + ("str_quoted_double", '"!@#$%^&*()[]:.,\'"', "!@#$%^&*()[]:.,'"), + ("str_quoted_outer_ws_single", "' a \t'", " a \t"), + ("str_quoted_outer_ws_double", '" a \t"', " a \t"), + ("str_quoted_int", "'123'", "123"), + ("str_quoted_null", "'null'", "null"), + ("str_quoted_bool", "['truE', \"FalSe\"]", ["truE", "FalSe"]), + ("str_quoted_list", "'[a,b, c]'", "[a,b, c]"), + ("str_quoted_dict", '"{a:b, c: d}"', "{a:b, c: d}"), + ("str_quoted_backslash_noesc_single", r"'a\b'", r"a\b"), + ("str_quoted_backslash_noesc_double", r'"a\b"', r"a\b"), + ("str_quoted_esc_backslash_1", r"'a\\b'", r"a\b"), + ("str_quoted_esc_backslash_2", r"'a\\\b'", r"a\\b"), + ("str_quoted_esc_backslash_3", r"'a\\\\b'", r"a\\b"), + ("str_quoted_concat_bad_2", "'Hi''there'", GrammarParseError), + ("str_quoted_too_many_1", "''a'", GrammarParseError), + ("str_quoted_too_many_2", "'a''", GrammarParseError), + ("str_quoted_too_many_3", "''a''", GrammarParseError), + ("str_quoted_empty", "''", ""), + # Quoted strings with interpolations. ("str_quoted_inter", "'${null}'", "None"), ("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"), - ("str_quoted_esc_single_2", "'\"\\\\\\\\\\${foo}\\ '", r'"\${foo}\ '), + ("str_quoted_esc_single_2", "'\"\\\\\\${foo}\\ '", r'"\${foo}\ '), ("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'), - ("str_quoted_esc_double_2", '"\'\\\\\\\\\\${foo}\\ "', r"'\${foo}\ "), + ("str_quoted_esc_double_2", '"\'\\\\\\${foo}\\ "', r"'\${foo}\ "), ("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError), # Whitespaces. ("ws_inter_node_outer", "${ \tdict.a \t}", 0), @@ -267,9 +270,6 @@ ("float_resolver_noquote", "${1.1:1,2,3}", GrammarParseError), ("float_resolver_exp", "${1e1:1,2,3}", GrammarParseError), ("inter_float_resolver", "${${float}:1,2,3}", (None, InterpolationResolutionError)), - # NaN as dictionary key (a resolver is used here to output only the key). - ("dict_nan_key_1", "${first:{nan: 0}}", math.nan), - ("dict_nan_key_2", "${first:{${test:nan}: 0}}", GrammarParseError), ] # Parameters for tests of the "configValue" rule (may contain node @@ -309,6 +309,7 @@ ("str_top_middle_escapes", r"abc\\\\\${str}", r"abc\\${str}"), ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 3), ("str_top_concat_interpolations", "${null}${float}", "None1.2"), + ("str_top_empty", "", ""), # Whitespaces. ("ws_toplevel", " \tab ${str} cd ${int}\t", " \tab hi cd 123\t"), # Unmatched braces. @@ -333,34 +334,33 @@ class TestOmegaConfGrammar: Test most grammar constructs. Each method in this class tests the validity of expressions in a specific - setting. For instance, `test_single_element_no_interpolation()` tests the - "singleElement" parsing rule on expressions that do not contain interpolations + setting. For instance, `test_single_element_no_callback()` tests the + "singleElement" parsing rule on expressions that do not need callbacks (which allows for faster tests without using any config object). Tests that actually need a config object all re-use the same `BASE_TEST_CFG` config, to avoid creating a new config for each test. """ - @parametrize_from(PARAMS_SINGLE_ELEMENT_NO_INTERPOLATION) - def test_single_element_no_interpolation( - self, definition: str, expected: Any - ) -> None: + @parametrize_from(PARAMS_SINGLE_ELEMENT_NO_CALLBACK) + def test_single_element_no_callback(self, definition: str, expected: Any) -> None: parse_tree, expected_visit = self._parse("singleElement", definition, expected) if parse_tree is None: return - # Since there are no interpolations here, we do not need to provide - # callbacks to resolve them, and the quoted string callback can simply - # be the identity. + def fake_callback(*args: Any, **kw: Any) -> None: + assert False # must not be called + visitor = grammar_visitor.GrammarVisitor( - node_interpolation_callback=None, # type: ignore - resolver_interpolation_callback=None, # type: ignore - quoted_string_callback=lambda s: s, + node_interpolation_callback=fake_callback, + resolver_interpolation_callback=fake_callback, + quoted_string_callback=fake_callback, # type: ignore ) + self._visit(lambda: visitor.visit(parse_tree), expected_visit) - @parametrize_from(PARAMS_SINGLE_ELEMENT_WITH_INTERPOLATION) - def test_single_element_with_resolver( + @parametrize_from(PARAMS_SINGLE_ELEMENT_WITH_CALLBACK) + def test_single_element_with_callback( self, restore_resolvers: Any, definition: str, expected: Any ) -> None: parse_tree, expected_visit = self._parse("singleElement", definition, expected)