From 37f0a5f69edbc5a1a41db8a241784f300c0a48d7 Mon Sep 17 00:00:00 2001 From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com> Date: Thu, 18 Mar 2021 15:25:01 -0400 Subject: [PATCH 1/2] Simpler escaping in the grammar, and fix to quoted values * In top-level strings, the only backslashes that can and should be escaped are those preceding an interpolation. For instance, if "x" is a node evaluating to "X": \${x} -> ${x} \\${x} -> \X \\\${x} -> \${x} ${x}\\ -> X\\ * In quoted strings, the only backslashes that can and should be escaped are those preceding a quote of the same type as the enclosing quotes. For instance: "abc\"def" -> abc"def "abc\\" -> abc\ "abc\\\"def" -> abc\"def "abc\\\'def" -> abc\\\'def This also fixes a bug with the parsing of quoted strings (see #617). Fixes #615 Fixes #617 --- omegaconf/grammar/OmegaConfGrammarLexer.g4 | 20 ++- omegaconf/grammar/OmegaConfGrammarParser.g4 | 2 +- omegaconf/grammar_visitor.py | 134 +++++++++++++++++--- tests/test_grammar.py | 34 ++++- 4 files changed, 157 insertions(+), 33 deletions(-) diff --git a/omegaconf/grammar/OmegaConfGrammarLexer.g4 b/omegaconf/grammar/OmegaConfGrammarLexer.g4 index 7a81073f2..50cdec9ec 100644 --- a/omegaconf/grammar/OmegaConfGrammarLexer.g4 +++ b/omegaconf/grammar/OmegaConfGrammarLexer.g4 @@ -16,8 +16,8 @@ fragment ESC_BACKSLASH: '\\\\'; // escaped backslash TOP_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE); -ESC_INTER: '\\${'; -TOP_ESC: ESC_BACKSLASH+ -> type(ESC); +ESC_INTER: ('\\\\')* '\\${'; // \${, \\\${, \\\\\${, ... +EVEN_BACKSLASHES: ESC_BACKSLASH+; // \\, \\\\, \\\\\\, ... // The backslash and dollar characters must not be grouped with others, so that // we can properly detect the tokens above. @@ -60,9 +60,21 @@ ESC: (ESC_BACKSLASH | '\\(' | '\\)' | '\\[' | '\\]' | '\\{' | '\\}' | '\\:' | '\\=' | '\\,' | '\\ ' | '\\\t')+; WS: [ \t]+; +// Quoted values for both types of quotes. +// A quoted value is made of the enclosing quotes, and either: +// - nothing else +// - an even number of backslashes (meaning they are escaped) +// - a sequence of any character, followed by any non-backslash character, and optionally +// an even number of backslashes (i.e., also escaped) +// Examples (right hand side: expected content of the resulting string, after processing): +// "" -> +// '\\' -> \ +// "\\\\" -> \\ +// 'abc\\' -> abc\ +// "abc\"def\\\'ghi\\\\" -> abc"def\\\'ghi\\ QUOTED_VALUE: - '\'' ('\\\''|.)*? '\'' // Single quotes, can contain escaped single quote : /' - | '"' ('\\"'|.)*? '"' ; // Double quotes, can contain escaped double quote : /" + '"' (('\\\\')* | (.)*? ~[\\] ('\\\\')*) '"' // double quotes + | '\'' (('\\\\')* | (.)*? ~[\\] ('\\\\')*) '\''; // single quotes //////////////////////// // INTERPOLATION_MODE // diff --git a/omegaconf/grammar/OmegaConfGrammarParser.g4 b/omegaconf/grammar/OmegaConfGrammarParser.g4 index 22933a953..806fac468 100644 --- a/omegaconf/grammar/OmegaConfGrammarParser.g4 +++ b/omegaconf/grammar/OmegaConfGrammarParser.g4 @@ -21,7 +21,7 @@ configValue: (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+)) EOF; singleElement: element EOF; // Top-level string (that does not need to be parsed). -toplevelStr: (ESC | ESC_INTER | TOP_CHAR | TOP_STR)+; +toplevelStr: (EVEN_BACKSLASHES | ESC_INTER | TOP_CHAR | TOP_STR)+; // Elements. diff --git a/omegaconf/grammar_visitor.py b/omegaconf/grammar_visitor.py index 56111e557..aa8a78eca 100644 --- a/omegaconf/grammar_visitor.py +++ b/omegaconf/grammar_visitor.py @@ -1,3 +1,4 @@ +import re import sys import warnings from typing import ( @@ -34,6 +35,16 @@ ) sys.exit(1) +# Regex matching trailing backslashes. +TRAILING_BACKSLASHES = re.compile("(\\\\)+$") + +# Regex matching escaped quotes, for each kind of quote. +# Note that we include *all* backslashes preceding the quote. +ESCAPED_QUOTE = { + "'": re.compile(r"(\\)+'"), # escaped single quote + '"': re.compile(r'(\\)+"'), # escaped double quote +} + class GrammarVisitor(OmegaConfGrammarParserVisitor): def __init__( @@ -100,14 +111,33 @@ def visitConfigValue(self, ctx: OmegaConfGrammarParser.ConfigValueContext) -> An # (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+)) EOF # Visit all children (except last one which is EOF) vals = [self.visit(c) for c in list(ctx.getChildren())[:-1]] - assert vals - if len(vals) == 1 and isinstance( + n_vals = len(vals) + assert n_vals > 0 + + if n_vals == 1 and isinstance( ctx.getChild(0), OmegaConfGrammarParser.InterpolationContext ): # Single interpolation: return the result "as is". return vals[0] + # Concatenation of multiple components. - return "".join(map(str, vals)) + # When a top-level string is followed by an interpolation, we need to un-escape + # any trailing backslash. + tokens = [] + for i, (child, val) in enumerate(zip(ctx.getChildren(), vals)): + if isinstance(child, OmegaConfGrammarParser.ToplevelStrContext): + if i < n_vals - 1: + # Top-level string followed by an interpolation. + assert isinstance( + ctx.getChild(i + 1), OmegaConfGrammarParser.InterpolationContext + ) + tokens.append(self._unescape_trailing_backslashes(val)) + else: + tokens.append(val) + else: + tokens.append(str(val)) + + return "".join(tokens) def visitDictKey(self, ctx: OmegaConfGrammarParser.DictKeyContext) -> Any: return self._createPrimitive(ctx) @@ -282,8 +312,19 @@ def visitSingleElement( return self.visit(ctx.getChild(0)) def visitToplevelStr(self, ctx: OmegaConfGrammarParser.ToplevelStrContext) -> str: - # (ESC | ESC_INTER | TOP_CHAR | TOP_STR)+ - return self._unescape(ctx.getChildren()) + # (EVEN_BACKSLASHES | ESC_INTER | TOP_CHAR | TOP_STR)+ + + # Concatenate all fragments, un-escaping interpolations. + tokens = [] + for child in ctx.getChildren(): + txt = child.getText() + if child.symbol.type == OmegaConfGrammarLexer.ESC_INTER: + # Un-escape the interpolation, e.g. \${ -> ${ or \\\${ -> \${ + assert len(txt) % 2 == 1 + txt = txt[-(len(txt) + 1) // 2 :] + tokens.append(txt) + + return "".join(tokens) def _createPrimitive( self, @@ -318,13 +359,13 @@ def _createPrimitive( elif symbol.type == OmegaConfGrammarLexer.BOOL: return symbol.text.lower() == "true" elif symbol.type == OmegaConfGrammarLexer.ESC: - return self._unescape([child]) + return self._unescape_from_sequence([child]) elif symbol.type == OmegaConfGrammarLexer.WS: # pragma: no cover # A single WS should have been "consumed" by another token. raise AssertionError("WS should never be reached") assert False, symbol.type # Concatenation of multiple items ==> un-escape the concatenation. - return self._unescape(ctx.getChildren()) + return self._unescape_from_sequence(ctx.getChildren()) def _resolve_quoted_string(self, quoted: str) -> str: """ @@ -333,23 +374,18 @@ def _resolve_quoted_string(self, quoted: str) -> str: # Identify quote type. assert len(quoted) >= 2 and quoted[0] == quoted[-1] quote_type = quoted[0] - assert quote_type in ["'", '"'] - - # Un-escape quotes and backslashes within the string (the two kinds of - # escapable characters in quoted strings). We do it in two passes: - # 1. Replace `\"` with `"` (and similarly for single quotes) - # 2. Replace `\\` with `\` - # The order is important so that `\\"` is replaced with an escaped quote `\"`. - # We also remove the start and end quotes. - esc_quote = f"\\{quote_type}" - quoted_content = ( - quoted[1:-1].replace(esc_quote, quote_type).replace("\\\\", "\\") - ) + + # Remove enclosing quotes. + content = quoted[1:-1] + + # Un-escape quotes then trailing backslashes. + content = self._unescape_quotes(content, quote_type) + content = self._unescape_trailing_backslashes(content) # Parse the string. - return self.quoted_string_callback(quoted_content) + return self.quoted_string_callback(content) - def _unescape( + def _unescape_from_sequence( self, seq: Iterable[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]], ) -> str: @@ -374,3 +410,59 @@ def _unescape( assert isinstance(node, OmegaConfGrammarParser.InterpolationContext) chrs.append(str(self.visitInterpolation(node))) return "".join(chrs) + + def _unescape_quotes(self, expr: str, quote_type: str) -> str: + """ + Un-escape escaped quotes from an expression. + + Examples: + abc -> abc + abc\'def -> abc'def + abc\'\'def\\\'gh -> abc''def\'gh + """ + pattern = ESCAPED_QUOTE[quote_type] + match = pattern.search(expr) + + if match is None: + return expr + + tokens = [] + while match is not None: + start, stop = match.span() + size = stop - start + # An escaped quote is made of an odd number of backslashes followed by a + # quote, so the total number of matching characters should be even. + assert size % 2 == 0 + # Add characters before the escaped quote. + tokens.append(expr[0:start]) + # Add the escaped quote (un-escaping the backslashes, which can be achieved + # by extracting the proper subset of the string). + tokens.append(expr[stop - size // 2 : stop]) + # Move on to next match. + expr = expr[stop:] + match = pattern.search(expr) + + # Add characters after the last match. + tokens.append(expr) + + return "".join(tokens) + + def _unescape_trailing_backslashes(self, expr: str) -> str: + """ + Un-escape trailing backslashes from `expr`. + + Examples: + abc -> abc + abc\\ -> abc\ + abc\\def -> abc\\def + abc\\def\\\\ -> abc\\def\\ + """ + match = TRAILING_BACKSLASHES.search(expr) + if match is None: + return expr + start, end = match.span() + n_backslashes = end - start + # Sanity check: there should be an even number of backslashes at end of string. + assert n_backslashes % 2 == 0 + # Un-escaping backslashes <=> removing half of them. + return expr[0 : start + n_backslashes // 2] diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 963b9b11d..cae1d5a65 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -137,6 +137,23 @@ ("str_quoted_too_many_1", "''a'", GrammarParseError), ("str_quoted_too_many_2", "'a''", GrammarParseError), ("str_quoted_too_many_3", "''a''", GrammarParseError), + ("str_quoted_trailing_esc_1", "'abc\\\\'", "abc\\"), + ("str_quoted_trailing_esc_2", "'abc\\\\\\\\'", "abc\\\\"), + ("str_quoted_no_esc_1", '"abc\\def"', "abc\\def"), + ("str_quoted_no_esc_2", '"abc\\\\def"', "abc\\\\def"), + ("str_quoted_no_esc_3", '"\\\\\\abc\\def"', "\\\\\\abc\\def"), + ("str_quoted_bad_1", '"abc\\"', GrammarParseError), + ("str_quoted_bad_2", '"abc\\\\\\"', GrammarParseError), + ("str_quoted_esc_quote_single_1", "'abc\\'def'", "abc'def"), + ("str_quoted_esc_quote_single_2", "'abc\\\\\\'def'", "abc\\'def"), + ("str_quoted_esc_quote_single_3", "'abc\\\\\\\\\\'def'", "abc\\\\'def"), + ("str_quoted_esc_quote_single_4", "'a\\'b\\'cdef\\\\\\''", "a'b'cdef\\'"), + ("str_quoted_esc_quote_single_bad", "'abc\\\\'def'", GrammarParseError), + ("str_quoted_esc_quote_double_1", '"abc\\"def"', 'abc"def'), + ("str_quoted_esc_quote_double_2", '"abc\\\\\\"def"', 'abc\\"def'), + ("str_quoted_esc_quote_double_3", '"abc\\\\\\\\\\"def"', 'abc\\\\"def'), + ("str_quoted_esc_quote_double_4", '"a\\"b\\"cdef\\\\\\""', 'a"b"cdef\\"'), + ("str_quoted_esc_quote_double_bad", '"abc\\\\"def"', GrammarParseError), # Lists and dictionaries. ("list", "[0, 1]", [0, 1]), ( @@ -218,9 +235,9 @@ # Interpolations in quoted strings. ("str_quoted_inter", "'${null}'", "None"), ("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"), - ("str_quoted_esc_single_2", "'\"\\\\\\\\\\${foo}\\ '", r'"\${foo}\ '), + ("str_quoted_esc_single_2", "'\"\\\\\\${foo}\\ '", r'"\${foo}\ '), ("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'), - ("str_quoted_esc_double_2", '"\'\\\\\\\\\\${foo}\\ "', r"'\${foo}\ "), + ("str_quoted_esc_double_2", '"\'\\\\\\${foo}\\ "', r"'\${foo}\ "), ("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError), # Whitespaces. ("ws_inter_node_outer", "${ \tdict.a \t}", 0), @@ -240,7 +257,7 @@ ("nested_simple", "${${ref_str}}", "hi"), ("nested_select", "${options.${choice}}", "A"), ("nested_relative", "${${rel_opt}.b}", "B"), - ("str_quoted_nested", r"'AB${test:\'CD${test:\\'EF\\'}GH\'}'", "ABCDEFGH"), + ("str_quoted_nested", r"'AB${test:\'CD${test:\\\'EF\\\'}GH\'}'", "ABCDEFGH"), # Resolver interpolations. ("no_args", "${test:}", []), ("space_in_args", "${test:a, b c}", ["a", "b c"]), @@ -272,8 +289,7 @@ ("dict_nan_key_2", "${first:{${test:nan}: 0}}", GrammarParseError), ] -# Parameters for tests of the "configValue" rule (may contain node -# interpolations, but no resolvers). +# Parameters for tests of the "configValue" rule (may contain interpolations). PARAMS_CONFIG_VALUE = [ # String interpolations (top-level). ("str_top_basic", "bonjour ${str}", "bonjour hi"), @@ -298,7 +314,9 @@ ("str_top_middle_quotes_single", "I like '${str}'", "I like 'hi'"), ("str_top_middle_quotes_double", 'I like "${str}"', 'I like "hi"'), ("str_top_any_char", "${str} !@\\#$%^&*})][({,/?;", "hi !@\\#$%^&*})][({,/?;"), - ("str_top_esc_inter", r"Esc: \${str}", "Esc: ${str}"), + ("str_top_esc_inter_1", r"Esc: \${str}", "Esc: ${str}"), + ("str_top_esc_inter_2", r"Esc: \\\${str}", r"Esc: \${str}"), + ("str_top_esc_inter_3", r"abc\\${str}\${str}\\", r"abc\hi${str}\\"), ("str_top_esc_inter_wrong_1", r"Wrong: $\{str\}", r"Wrong: $\{str\}"), ("str_top_esc_inter_wrong_2", r"Wrong: \${str\}", r"Wrong: ${str\}"), ("str_top_esc_backslash", r"Esc: \\${str}", r"Esc: \hi"), @@ -307,8 +325,9 @@ ("str_top_trailing_dollars", r"${str}$$$$", "hi$$$$"), ("str_top_leading_escapes", r"\\\\\${str}", r"\\${str}"), ("str_top_middle_escapes", r"abc\\\\\${str}", r"abc\\${str}"), - ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 3), + ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 5), ("str_top_concat_interpolations", "${null}${float}", "None1.2"), + ("str_top_issue_617", r""" ${test: "hi\\" }"} """, ' hi\\"} '), # Whitespaces. ("ws_toplevel", " \tab ${str} cd ${int}\t", " \tab hi cd 123\t"), # Unmatched braces. @@ -376,6 +395,7 @@ def test_config_value( self, restore_resolvers: Any, definition: str, expected: Any ) -> None: parse_tree, expected_visit = self._parse("configValue", definition, expected) + OmegaConf.register_new_resolver("test", self._resolver_test) self._visit_with_config(parse_tree, expected_visit) @parametrize_from( From 7ce9647f3a8d277f6d831ee1f77fc275e144b77a Mon Sep 17 00:00:00 2001 From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com> Date: Thu, 18 Mar 2021 16:05:47 -0400 Subject: [PATCH 2/2] Fix coverage --- omegaconf/grammar_visitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omegaconf/grammar_visitor.py b/omegaconf/grammar_visitor.py index aa8a78eca..dc3e8696c 100644 --- a/omegaconf/grammar_visitor.py +++ b/omegaconf/grammar_visitor.py @@ -403,7 +403,7 @@ def _unescape_from_sequence( if s.type == OmegaConfGrammarLexer.ESC: chrs.append(s.text[1::2]) elif s.type == OmegaConfGrammarLexer.ESC_INTER: - chrs.append(s.text[1:]) + assert False # escaped interpolations are handled elsewhere else: chrs.append(s.text) else: