From 58821c7206ca1aaa3d2dc0f5a564137d75b09516 Mon Sep 17 00:00:00 2001 From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com> Date: Thu, 6 May 2021 08:38:53 -0400 Subject: [PATCH 1/4] Grammar: only un-escape \\ in situations where escaping is required This simplifies situations where one wants to obtain a string containing multiple backslashes. --- omegaconf/grammar/OmegaConfGrammarLexer.g4 | 59 ++++++++++++++++----- omegaconf/grammar/OmegaConfGrammarParser.g4 | 2 +- omegaconf/grammar_visitor.py | 7 ++- tests/test_grammar.py | 24 ++++++--- 4 files changed, 70 insertions(+), 22 deletions(-) diff --git a/omegaconf/grammar/OmegaConfGrammarLexer.g4 b/omegaconf/grammar/OmegaConfGrammarLexer.g4 index 50c4f10a6..e7325f8ad 100644 --- a/omegaconf/grammar/OmegaConfGrammarLexer.g4 +++ b/omegaconf/grammar/OmegaConfGrammarLexer.g4 @@ -16,13 +16,23 @@ fragment ESC_BACKSLASH: '\\\\'; // escaped backslash TOP_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE); -ESC_INTER: '\\${'; -TOP_ESC: ESC_BACKSLASH+ -> type(ESC); +// Regular string: anything that does not contain any $ and does not end with \ +// (this ensures this rule will not consume characters required to recognize other tokens). +ANY_STR: ~[$]* ~[\\$]; + +// Escaped interpolation: '\${', optionally preceded by an even number of \ +ESC_INTER: ESC_BACKSLASH* '\\${'; + +// Escaped backslashes: even number of \ followed by an interpolation. +// The interpolation must *not* be matched by this rule (this is why we use a predicate lookahead). +TOP_ESC: ESC_BACKSLASH+ { self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{") }? -> type(ESC); + +// Other backslashes that will not need escaping. +BACKSLASHES: '\\'+ -> type(ANY_STR); + +// The dollar sign must be singled out so that we can recognize interpolations. +DOLLAR: '$' -> type(ANY_STR); -// The backslash and dollar characters must not be grouped with others, so that -// we can properly detect the tokens above. -SPECIAL_CHAR: [\\$]; -ANY_STR: ~[\\$]+; // anything else //////////////// // VALUE_MODE // @@ -96,11 +106,26 @@ mode QUOTED_SINGLE_MODE; QSINGLE_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE); MATCHING_QUOTE_CLOSE: '\'' -> popMode; -QSINGLE_ESC: ('\\\'' | ESC_BACKSLASH)+ -> type(ESC); +// Regular string: anything that does not contain any $ *or quote* and does not end with \ +QSINGLE_STR: ~['$]* ~['\\$] -> type(ANY_STR); + QSINGLE_ESC_INTER: ESC_INTER -> type(ESC_INTER); -QSINGLE_SPECIAL_CHAR: SPECIAL_CHAR -> type(SPECIAL_CHAR); -QSINGLE_STR: ~['\\$]+ -> type(ANY_STR); +// In a quoted string we also have the following escape sequences: +// - \', optionally preceded by an even number of \ (escaped quote) +// - an even number of \ followed by either the closing quote (trailing backslashes) or by +// an interpolation (as in `DEFAULT_MODE`) -- which must not be matched by this rule +QSINGLE_ESC: + ( + ESC_BACKSLASH* '\\\'' | + ESC_BACKSLASH+ {( + self._input.LA(1) == ord("'") or + (self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{")) + )}? + ) -> type(ESC); + +QSINGLE_BACKSLASHES: '\\'+ -> type(ANY_STR); +QSINGLE_DOLLAR: '$' -> type(ANY_STR); //////////////////////// @@ -114,8 +139,18 @@ mode QUOTED_DOUBLE_MODE; QDOUBLE_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE); QDOUBLE_CLOSE: '"' -> type(MATCHING_QUOTE_CLOSE), popMode; -QDOUBLE_ESC: ('\\"' | ESC_BACKSLASH)+ -> type(ESC); +QDOUBLE_STR: ~["$]* ~["\\$] -> type(ANY_STR); + QDOUBLE_ESC_INTER: ESC_INTER -> type(ESC_INTER); -QDOUBLE_SPECIAL_CHAR: SPECIAL_CHAR -> type(SPECIAL_CHAR); -QDOUBLE_STR: ~["\\$]+ -> type(ANY_STR); +QDOUBLE_ESC: + ( + ESC_BACKSLASH* '\\"' | + ESC_BACKSLASH+ {( + self._input.LA(1) == ord('"') or + (self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{")) + )}? + ) -> type(ESC); + +QDOUBLE_BACKSLASHES: '\\'+ -> type(ANY_STR); +QDOUBLE_DOLLAR: '$' -> type(ANY_STR); diff --git a/omegaconf/grammar/OmegaConfGrammarParser.g4 b/omegaconf/grammar/OmegaConfGrammarParser.g4 index 4383da946..5ec70cf07 100644 --- a/omegaconf/grammar/OmegaConfGrammarParser.g4 +++ b/omegaconf/grammar/OmegaConfGrammarParser.g4 @@ -23,7 +23,7 @@ singleElement: element EOF; // Composite text expression (may contain interpolations). -text: (interpolation | ESC | ESC_INTER | SPECIAL_CHAR | ANY_STR)+; +text: (interpolation | ESC | ESC_INTER | ANY_STR)+; // Elements. diff --git a/omegaconf/grammar_visitor.py b/omegaconf/grammar_visitor.py index 4588253ef..8df6bfa70 100644 --- a/omegaconf/grammar_visitor.py +++ b/omegaconf/grammar_visitor.py @@ -289,7 +289,7 @@ def visitSingleElement( return self.visit(ctx.getChild(0)) def visitText(self, ctx: OmegaConfGrammarParser.TextContext) -> Any: - # (interpolation | ESC | ESC_INTER | SPECIAL_CHAR | ANY_STR)+ + # (interpolation | ESC | ESC_INTER | ANY_STR)+ # Single interpolation? If yes, return its resolved value "as is". if ctx.getChildCount() == 1: @@ -356,7 +356,10 @@ def _unescape( if s.type == OmegaConfGrammarLexer.ESC: chrs.append(s.text[1::2]) elif s.type == OmegaConfGrammarLexer.ESC_INTER: - chrs.append(s.text[1:]) + # `ESC_INTER` is of the form `\\...\${`: the formula below computes + # the number of characters to keep at the end of the string to remove + # the correct number of backslashes. + chrs.append(s.text[-(len(s.text) // 2 + 1) :]) else: chrs.append(s.text) else: diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 025511bea..cabf4e54c 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -143,9 +143,14 @@ ("str_quoted_too_many_3", "''a''", GrammarParseError), ("str_quoted_trailing_esc_1", r"'abc\\'", r" abc\ ".strip()), ("str_quoted_trailing_esc_2", r"'abc\\\\'", r" abc\\ ".strip()), - ("str_quoted_no_esc_1", r'"abc\def"', r"abc\def"), - ("str_quoted_no_esc_2", r'"abc\\def"', r"abc\def"), - ("str_quoted_no_esc_3", r'"\\\abc\def"', r"\\abc\def"), + ("str_quoted_no_esc_single_1", r"'abc\def'", r"abc\def"), + ("str_quoted_no_esc_single_2", r"'abc\\def'", r"abc\\def"), + ("str_quoted_no_esc_single_3", r"'\\\abc\def'", r"\\\abc\def"), + ("str_quoted_no_esc_dollar_single", r"'abc\\$$'", r"abc\\$$"), + ("str_quoted_no_esc_double_1", r'"abc\def"', r"abc\def"), + ("str_quoted_no_esc_double_2", r'"abc\\def"', r"abc\\def"), + ("str_quoted_no_esc_double_3", r'"\\\abc\def"', r"\\\abc\def"), + ("str_quoted_no_esc_dollar_double", r'"abc\\$$"', r"abc\\$$"), ("str_quoted_bad_1", r'"abc\"', GrammarParseError), ("str_quoted_bad_2", r'"abc\\\"', GrammarParseError), ("str_quoted_esc_quote_single_1", r"'abc\'def'", "abc'def"), @@ -255,8 +260,10 @@ ("str_quoted_inter", "'${null}'", "None"), ("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"), ("str_quoted_esc_single_2", r"""'\\\${foo}'""", r"\${foo}"), + ("str_quoted_esc_single_3", r"""'\\a_${str}\\'""", r" \\a_hi\ ".strip()), ("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'), ("str_quoted_esc_double_2", r'''"\\\${foo}"''', r"\${foo}"), + ("str_quoted_esc_double_3", r'''"\\a_${str}\\"''', r" \\a_hi\ ".strip()), ("str_quoted_other_quote_double", """'double"'""", 'double"'), ("str_quoted_other_quote_single", '''"single'"''', "single'"), ("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError), @@ -357,13 +364,16 @@ ("str_top_esc_inter", r"Esc: \${str}", "Esc: ${str}"), ("str_top_esc_inter_wrong_1", r"Wrong: $\{str\}", r"Wrong: $\{str\}"), ("str_top_esc_inter_wrong_2", r"Wrong: \${str\}", r"Wrong: ${str\}"), - ("str_top_esc_backslash", r"Esc: \\${str}", r"Esc: \hi"), + ("str_top_esc_backslash_1", r"Esc: \\${str}", r"Esc: \hi"), + ("str_top_esc_backslash_2", r"Esc: \\\\${str}", r"Esc: \\hi"), ("str_top_quoted_braces_wrong", r"Wrong: \{${str}\}", r"Wrong: \{hi\}"), ("str_top_leading_dollars", r"$$${str}", "$$hi"), ("str_top_trailing_dollars", r"${str}$$$$", "hi$$$$"), - ("str_top_leading_escapes", r"\\\\\${str}", r"\\${str}"), - ("str_top_middle_escapes", r"abc\\\\\${str}", r"abc\\${str}"), - ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 3), + ("str_top_leading_escapes_1", r"\\\\\${str}", r"\\${str}"), + ("str_top_leading_escapes_2", r"\\\\ \${str}", r"\\\\ ${str}"), + ("str_top_middle_escapes_1", r"abc\\\\\${str}", r"abc\\${str}"), + ("str_top_middle_escapes_2", r"abc\\\\ \${str}", r"abc\\\\ ${str}"), + ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 5), ("str_top_concat_interpolations", "${null}${float}", "None1.2"), ("str_top_issue_617", r""" ${test: "hi\\" }"} """, r" hi\"} "), # Whitespaces. From f94e3dadc96ebe3d0131cd8a0064ba31018627dc Mon Sep 17 00:00:00 2001 From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com> Date: Thu, 6 May 2021 15:47:47 -0400 Subject: [PATCH 2/4] Split test in two --- tests/test_grammar.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index cabf4e54c..ffe22595f 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -260,10 +260,12 @@ ("str_quoted_inter", "'${null}'", "None"), ("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"), ("str_quoted_esc_single_2", r"""'\\\${foo}'""", r"\${foo}"), - ("str_quoted_esc_single_3", r"""'\\a_${str}\\'""", r" \\a_hi\ ".strip()), + ("str_quoted_esc_single_3", r"""'\\a_${str}'""", r"\\a_hi"), + ("str_quoted_esc_single_4", r"""'a_${str}\\'""", r" a_hi\ ".strip()), ("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'), ("str_quoted_esc_double_2", r'''"\\\${foo}"''', r"\${foo}"), - ("str_quoted_esc_double_3", r'''"\\a_${str}\\"''', r" \\a_hi\ ".strip()), + ("str_quoted_esc_double_3", r'''"\\a_${str}"''', r"\\a_hi"), + ("str_quoted_esc_double_4", r'''"a_${str}\\"''', r" a_hi\ ".strip()), ("str_quoted_other_quote_double", """'double"'""", 'double"'), ("str_quoted_other_quote_single", '''"single'"''', "single'"), ("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError), From 1eb0dcc780c7f0946abe6703bf5c5f2998809912 Mon Sep 17 00:00:00 2001 From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com> Date: Thu, 6 May 2021 15:52:15 -0400 Subject: [PATCH 3/4] Minor test tweak --- tests/test_grammar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index ffe22595f..c45d5e4a7 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -375,7 +375,7 @@ ("str_top_leading_escapes_2", r"\\\\ \${str}", r"\\\\ ${str}"), ("str_top_middle_escapes_1", r"abc\\\\\${str}", r"abc\\${str}"), ("str_top_middle_escapes_2", r"abc\\\\ \${str}", r"abc\\\\ ${str}"), - ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 5), + ("str_top_trailing_escapes", r" ${str}\\\ ".strip(), r" hi\\\ ".strip()), ("str_top_concat_interpolations", "${null}${float}", "None1.2"), ("str_top_issue_617", r""" ${test: "hi\\" }"} """, r" hi\"} "), # Whitespaces. From c5d3a0763c6b97201e6f7dcfc1613e55857abd74 Mon Sep 17 00:00:00 2001 From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com> Date: Thu, 6 May 2021 18:59:25 -0400 Subject: [PATCH 4/4] Move the detection of \ to escape from lexer to visitor --- omegaconf/grammar/OmegaConfGrammarLexer.g4 | 35 ++++----------- omegaconf/grammar/OmegaConfGrammarParser.g4 | 2 +- omegaconf/grammar_visitor.py | 48 +++++++++++++++------ 3 files changed, 45 insertions(+), 40 deletions(-) diff --git a/omegaconf/grammar/OmegaConfGrammarLexer.g4 b/omegaconf/grammar/OmegaConfGrammarLexer.g4 index e7325f8ad..73f616b7c 100644 --- a/omegaconf/grammar/OmegaConfGrammarLexer.g4 +++ b/omegaconf/grammar/OmegaConfGrammarLexer.g4 @@ -23,11 +23,10 @@ ANY_STR: ~[$]* ~[\\$]; // Escaped interpolation: '\${', optionally preceded by an even number of \ ESC_INTER: ESC_BACKSLASH* '\\${'; -// Escaped backslashes: even number of \ followed by an interpolation. -// The interpolation must *not* be matched by this rule (this is why we use a predicate lookahead). -TOP_ESC: ESC_BACKSLASH+ { self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{") }? -> type(ESC); +// Backslashes that *may* be escaped (even number). +TOP_ESC: ESC_BACKSLASH+; -// Other backslashes that will not need escaping. +// Other backslashes that will not need escaping (odd number due to not matching the previous rule). BACKSLASHES: '\\'+ -> type(ANY_STR); // The dollar sign must be singled out so that we can recognize interpolations. @@ -111,19 +110,10 @@ QSINGLE_STR: ~['$]* ~['\\$] -> type(ANY_STR); QSINGLE_ESC_INTER: ESC_INTER -> type(ESC_INTER); -// In a quoted string we also have the following escape sequences: -// - \', optionally preceded by an even number of \ (escaped quote) -// - an even number of \ followed by either the closing quote (trailing backslashes) or by -// an interpolation (as in `DEFAULT_MODE`) -- which must not be matched by this rule -QSINGLE_ESC: - ( - ESC_BACKSLASH* '\\\'' | - ESC_BACKSLASH+ {( - self._input.LA(1) == ord("'") or - (self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{")) - )}? - ) -> type(ESC); +// Escaped quote (optionally preceded by an even number of backslashes). +QSINGLE_ESC_QUOTE: ESC_BACKSLASH* '\\\'' -> type(ESC); +QUOTED_ESC: ESC_BACKSLASH+; QSINGLE_BACKSLASHES: '\\'+ -> type(ANY_STR); QSINGLE_DOLLAR: '$' -> type(ANY_STR); @@ -140,17 +130,8 @@ QDOUBLE_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE) QDOUBLE_CLOSE: '"' -> type(MATCHING_QUOTE_CLOSE), popMode; QDOUBLE_STR: ~["$]* ~["\\$] -> type(ANY_STR); - QDOUBLE_ESC_INTER: ESC_INTER -> type(ESC_INTER); - -QDOUBLE_ESC: - ( - ESC_BACKSLASH* '\\"' | - ESC_BACKSLASH+ {( - self._input.LA(1) == ord('"') or - (self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{")) - )}? - ) -> type(ESC); - +QDOUBLE_ESC_QUOTE: ESC_BACKSLASH* '\\"' -> type(ESC); +QDOUBLE_ESC: ESC_BACKSLASH+ -> type(QUOTED_ESC); QDOUBLE_BACKSLASHES: '\\'+ -> type(ANY_STR); QDOUBLE_DOLLAR: '$' -> type(ANY_STR); diff --git a/omegaconf/grammar/OmegaConfGrammarParser.g4 b/omegaconf/grammar/OmegaConfGrammarParser.g4 index 5ec70cf07..f0f256c9a 100644 --- a/omegaconf/grammar/OmegaConfGrammarParser.g4 +++ b/omegaconf/grammar/OmegaConfGrammarParser.g4 @@ -23,7 +23,7 @@ singleElement: element EOF; // Composite text expression (may contain interpolations). -text: (interpolation | ESC | ESC_INTER | ANY_STR)+; +text: (interpolation | ANY_STR | ESC | ESC_INTER | TOP_ESC | QUOTED_ESC)+; // Elements. diff --git a/omegaconf/grammar_visitor.py b/omegaconf/grammar_visitor.py index 8df6bfa70..1771516dc 100644 --- a/omegaconf/grammar_visitor.py +++ b/omegaconf/grammar_visitor.py @@ -1,12 +1,12 @@ import sys import warnings +from itertools import zip_longest from typing import ( TYPE_CHECKING, Any, Callable, Dict, Generator, - Iterable, List, Optional, Set, @@ -289,7 +289,7 @@ def visitSingleElement( return self.visit(ctx.getChild(0)) def visitText(self, ctx: OmegaConfGrammarParser.TextContext) -> Any: - # (interpolation | ESC | ESC_INTER | ANY_STR)+ + # (interpolation | ANY_STR | ESC | ESC_INTER | TOP_ESC | QUOTED_ESC)+ # Single interpolation? If yes, return its resolved value "as is". if ctx.getChildCount() == 1: @@ -298,7 +298,7 @@ def visitText(self, ctx: OmegaConfGrammarParser.TextContext) -> Any: return self.visitInterpolation(c) # Otherwise, concatenate string representations together. - return self._unescape(ctx.getChildren()) + return self._unescape(list(ctx.getChildren())) def _createPrimitive( self, @@ -336,11 +336,11 @@ def _createPrimitive( raise AssertionError("WS should never be reached") assert False, symbol.type # Concatenation of multiple items ==> un-escape the concatenation. - return self._unescape(ctx.getChildren()) + return self._unescape(list(ctx.getChildren())) def _unescape( self, - seq: Iterable[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]], + seq: List[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]], ) -> str: """ Concatenate all symbols / interpolations in `seq`, unescaping symbols as needed. @@ -350,19 +350,43 @@ def _unescape( resolving of the interpolation). """ chrs = [] - for node in seq: + for node, next_node in zip_longest(seq, seq[1:]): if isinstance(node, TerminalNode): s = node.symbol - if s.type == OmegaConfGrammarLexer.ESC: - chrs.append(s.text[1::2]) - elif s.type == OmegaConfGrammarLexer.ESC_INTER: + if s.type == OmegaConfGrammarLexer.ESC_INTER: # `ESC_INTER` is of the form `\\...\${`: the formula below computes # the number of characters to keep at the end of the string to remove # the correct number of backslashes. - chrs.append(s.text[-(len(s.text) // 2 + 1) :]) + text = s.text[-(len(s.text) // 2 + 1) :] + elif ( + # Character sequence identified as requiring un-escaping. + s.type == OmegaConfGrammarLexer.ESC + or ( + # At top level, we need to un-escape backslashes that precede + # an interpolation. + s.type == OmegaConfGrammarLexer.TOP_ESC + and isinstance( + next_node, OmegaConfGrammarParser.InterpolationContext + ) + ) + or ( + # In a quoted sring, we need to un-escape backslashes that + # either end the string, or are followed by an interpolation. + s.type == OmegaConfGrammarLexer.QUOTED_ESC + and ( + next_node is None + or isinstance( + next_node, OmegaConfGrammarParser.InterpolationContext + ) + ) + ) + ): + text = s.text[1::2] # un-escape the sequence else: - chrs.append(s.text) + text = s.text # keep the original text else: assert isinstance(node, OmegaConfGrammarParser.InterpolationContext) - chrs.append(str(self.visitInterpolation(node))) + text = str(self.visitInterpolation(node)) + chrs.append(text) + return "".join(chrs)