Simpler escaping in the grammar, and fix to quoted values

* In top-level strings, the only backslashes that can and should be escaped are those preceding an interpolation. For instance, if "x" is a node evaluating to "X": \${x} -> ${x} \\${x} -> \X \\\${x} -> \${x} ${x}\\ -> X\\ * In quoted strings, the only backslashes that can and should be escaped are those preceding a quote of the same type as the enclosing quotes. For instance: "abc\"def" -> abc"def "abc\\" -> abc\ "abc\\\"def" -> abc\"def "abc\\\'def" -> abc\\\'def This also fixes a bug with the parsing of quoted strings (see omry#617). Fixes omry#615 Fixes omry#617
odelalleau · Mar 18, 2021 · 37f0a5f · 37f0a5f
1 parent 074e8dc
commit 37f0a5f
Show file tree

Hide file tree

Showing 4 changed files with 157 additions and 33 deletions.
diff --git a/omegaconf/grammar/OmegaConfGrammarLexer.g4 b/omegaconf/grammar/OmegaConfGrammarLexer.g4
@@ -16,8 +16,8 @@ fragment ESC_BACKSLASH: '\\\\';  // escaped backslash
 
 TOP_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);
 
-ESC_INTER: '\\${';
-TOP_ESC: ESC_BACKSLASH+ -> type(ESC);
+ESC_INTER: ('\\\\')* '\\${';       // \${, \\\${, \\\\\${, ...
+EVEN_BACKSLASHES: ESC_BACKSLASH+;  // \\, \\\\, \\\\\\, ...
 
 // The backslash and dollar characters must not be grouped with others, so that
 // we can properly detect the tokens above.
@@ -60,9 +60,21 @@ ESC: (ESC_BACKSLASH | '\\(' | '\\)' | '\\[' | '\\]' | '\\{' | '\\}' |
       '\\:' | '\\=' | '\\,' | '\\ ' | '\\\t')+;
 WS: [ \t]+;
 
+// Quoted values for both types of quotes.
+// A quoted value is made of the enclosing quotes, and either:
+//   - nothing else
+//   - an even number of backslashes (meaning they are escaped)
+//   - a sequence of any character, followed by any non-backslash character, and optionally
+//     an even number of backslashes (i.e., also escaped)
+// Examples (right hand side: expected content of the resulting string, after processing):
+//    ""                    -> <empty>
+//    '\\'                  -> \
+//    "\\\\"                -> \\
+//    'abc\\'               -> abc\
+//    "abc\"def\\\'ghi\\\\" -> abc"def\\\'ghi\\
 QUOTED_VALUE:
-      '\'' ('\\\''|.)*? '\'' // Single quotes, can contain escaped single quote : /'
-    | '"' ('\\"'|.)*? '"' ;  // Double quotes, can contain escaped double quote : /"
+      '"' (('\\\\')* | (.)*? ~[\\] ('\\\\')*) '"'     // double quotes
+    | '\'' (('\\\\')* | (.)*? ~[\\] ('\\\\')*) '\'';  // single quotes
 
 ////////////////////////
 // INTERPOLATION_MODE //

diff --git a/omegaconf/grammar/OmegaConfGrammarParser.g4 b/omegaconf/grammar/OmegaConfGrammarParser.g4
@@ -21,7 +21,7 @@ configValue: (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+)) EOF;
 singleElement: element EOF;
 
 // Top-level string (that does not need to be parsed).
-toplevelStr: (ESC | ESC_INTER | TOP_CHAR | TOP_STR)+;
+toplevelStr: (EVEN_BACKSLASHES | ESC_INTER | TOP_CHAR | TOP_STR)+;
 
 // Elements.
 

diff --git a/omegaconf/grammar_visitor.py b/omegaconf/grammar_visitor.py
@@ -1,3 +1,4 @@
+import re
 import sys
 import warnings
 from typing import (
@@ -34,6 +35,16 @@
     )
     sys.exit(1)
 
+# Regex matching trailing backslashes.
+TRAILING_BACKSLASHES = re.compile("(\\\\)+$")
+
+# Regex matching escaped quotes, for each kind of quote.
+# Note that we include *all* backslashes preceding the quote.
+ESCAPED_QUOTE = {
+    "'": re.compile(r"(\\)+'"),  # escaped single quote
+    '"': re.compile(r'(\\)+"'),  # escaped double quote
+}
+
 
 class GrammarVisitor(OmegaConfGrammarParserVisitor):
     def __init__(
@@ -100,14 +111,33 @@ def visitConfigValue(self, ctx: OmegaConfGrammarParser.ConfigValueContext) -> An
         # (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+)) EOF
         # Visit all children (except last one which is EOF)
         vals = [self.visit(c) for c in list(ctx.getChildren())[:-1]]
-        assert vals
-        if len(vals) == 1 and isinstance(
+        n_vals = len(vals)
+        assert n_vals > 0
+
+        if n_vals == 1 and isinstance(
             ctx.getChild(0), OmegaConfGrammarParser.InterpolationContext
         ):
             # Single interpolation: return the result "as is".
             return vals[0]
+
         # Concatenation of multiple components.
-        return "".join(map(str, vals))
+        # When a top-level string is followed by an interpolation, we need to un-escape
+        # any trailing backslash.
+        tokens = []
+        for i, (child, val) in enumerate(zip(ctx.getChildren(), vals)):
+            if isinstance(child, OmegaConfGrammarParser.ToplevelStrContext):
+                if i < n_vals - 1:
+                    # Top-level string followed by an interpolation.
+                    assert isinstance(
+                        ctx.getChild(i + 1), OmegaConfGrammarParser.InterpolationContext
+                    )
+                    tokens.append(self._unescape_trailing_backslashes(val))
+                else:
+                    tokens.append(val)
+            else:
+                tokens.append(str(val))
+
+        return "".join(tokens)
 
     def visitDictKey(self, ctx: OmegaConfGrammarParser.DictKeyContext) -> Any:
         return self._createPrimitive(ctx)
@@ -282,8 +312,19 @@ def visitSingleElement(
         return self.visit(ctx.getChild(0))
 
     def visitToplevelStr(self, ctx: OmegaConfGrammarParser.ToplevelStrContext) -> str:
-        # (ESC | ESC_INTER | TOP_CHAR | TOP_STR)+
-        return self._unescape(ctx.getChildren())
+        # (EVEN_BACKSLASHES | ESC_INTER | TOP_CHAR | TOP_STR)+
+
+        # Concatenate all fragments, un-escaping interpolations.
+        tokens = []
+        for child in ctx.getChildren():
+            txt = child.getText()
+            if child.symbol.type == OmegaConfGrammarLexer.ESC_INTER:
+                # Un-escape the interpolation, e.g. \${ -> ${ or \\\${ -> \${
+                assert len(txt) % 2 == 1
+                txt = txt[-(len(txt) + 1) // 2 :]
+            tokens.append(txt)
+
+        return "".join(tokens)
 
     def _createPrimitive(
         self,
@@ -318,13 +359,13 @@ def _createPrimitive(
             elif symbol.type == OmegaConfGrammarLexer.BOOL:
                 return symbol.text.lower() == "true"
             elif symbol.type == OmegaConfGrammarLexer.ESC:
-                return self._unescape([child])
+                return self._unescape_from_sequence([child])
             elif symbol.type == OmegaConfGrammarLexer.WS:  # pragma: no cover
                 # A single WS should have been "consumed" by another token.
                 raise AssertionError("WS should never be reached")
             assert False, symbol.type
         # Concatenation of multiple items ==> un-escape the concatenation.
-        return self._unescape(ctx.getChildren())
+        return self._unescape_from_sequence(ctx.getChildren())
 
     def _resolve_quoted_string(self, quoted: str) -> str:
         """
@@ -333,23 +374,18 @@ def _resolve_quoted_string(self, quoted: str) -> str:
         # Identify quote type.
         assert len(quoted) >= 2 and quoted[0] == quoted[-1]
         quote_type = quoted[0]
-        assert quote_type in ["'", '"']
-
-        # Un-escape quotes and backslashes within the string (the two kinds of
-        # escapable characters in quoted strings). We do it in two passes:
-        #   1. Replace `\"` with `"` (and similarly for single quotes)
-        #   2. Replace `\\` with `\`
-        # The order is important so that `\\"` is replaced with an escaped quote `\"`.
-        # We also remove the start and end quotes.
-        esc_quote = f"\\{quote_type}"
-        quoted_content = (
-            quoted[1:-1].replace(esc_quote, quote_type).replace("\\\\", "\\")
-        )
+
+        # Remove enclosing quotes.
+        content = quoted[1:-1]
+
+        # Un-escape quotes then trailing backslashes.
+        content = self._unescape_quotes(content, quote_type)
+        content = self._unescape_trailing_backslashes(content)
 
         # Parse the string.
-        return self.quoted_string_callback(quoted_content)
+        return self.quoted_string_callback(content)
 
-    def _unescape(
+    def _unescape_from_sequence(
         self,
         seq: Iterable[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]],
     ) -> str:
@@ -374,3 +410,59 @@ def _unescape(
                 assert isinstance(node, OmegaConfGrammarParser.InterpolationContext)
                 chrs.append(str(self.visitInterpolation(node)))
         return "".join(chrs)
+
+    def _unescape_quotes(self, expr: str, quote_type: str) -> str:
+        """
+        Un-escape escaped quotes from an expression.
+
+        Examples:
+            abc              -> abc
+            abc\'def         -> abc'def
+            abc\'\'def\\\'gh -> abc''def\'gh
+        """
+        pattern = ESCAPED_QUOTE[quote_type]
+        match = pattern.search(expr)
+
+        if match is None:
+            return expr
+
+        tokens = []
+        while match is not None:
+            start, stop = match.span()
+            size = stop - start
+            # An escaped quote is made of an odd number of backslashes followed by a
+            # quote, so the total number of matching characters should be even.
+            assert size % 2 == 0
+            # Add characters before the escaped quote.
+            tokens.append(expr[0:start])
+            # Add the escaped quote (un-escaping the backslashes, which can be achieved
+            # by extracting the proper subset of the string).
+            tokens.append(expr[stop - size // 2 : stop])
+            # Move on to next match.
+            expr = expr[stop:]
+            match = pattern.search(expr)
+
+        # Add characters after the last match.
+        tokens.append(expr)
+
+        return "".join(tokens)
+
+    def _unescape_trailing_backslashes(self, expr: str) -> str:
+        """
+        Un-escape trailing backslashes from `expr`.
+
+        Examples:
+            abc          -> abc
+            abc\\        -> abc\
+            abc\\def     -> abc\\def
+            abc\\def\\\\ -> abc\\def\\
+        """
+        match = TRAILING_BACKSLASHES.search(expr)
+        if match is None:
+            return expr
+        start, end = match.span()
+        n_backslashes = end - start
+        # Sanity check: there should be an even number of backslashes at end of string.
+        assert n_backslashes % 2 == 0
+        # Un-escaping backslashes <=> removing half of them.
+        return expr[0 : start + n_backslashes // 2]
diff --git a/tests/test_grammar.py b/tests/test_grammar.py
@@ -137,6 +137,23 @@
     ("str_quoted_too_many_1", "''a'", GrammarParseError),
     ("str_quoted_too_many_2", "'a''", GrammarParseError),
     ("str_quoted_too_many_3", "''a''", GrammarParseError),
+    ("str_quoted_trailing_esc_1", "'abc\\\\'", "abc\\"),
+    ("str_quoted_trailing_esc_2", "'abc\\\\\\\\'", "abc\\\\"),
+    ("str_quoted_no_esc_1", '"abc\\def"', "abc\\def"),
+    ("str_quoted_no_esc_2", '"abc\\\\def"', "abc\\\\def"),
+    ("str_quoted_no_esc_3", '"\\\\\\abc\\def"', "\\\\\\abc\\def"),
+    ("str_quoted_bad_1", '"abc\\"', GrammarParseError),
+    ("str_quoted_bad_2", '"abc\\\\\\"', GrammarParseError),
+    ("str_quoted_esc_quote_single_1", "'abc\\'def'", "abc'def"),
+    ("str_quoted_esc_quote_single_2", "'abc\\\\\\'def'", "abc\\'def"),
+    ("str_quoted_esc_quote_single_3", "'abc\\\\\\\\\\'def'", "abc\\\\'def"),
+    ("str_quoted_esc_quote_single_4", "'a\\'b\\'cdef\\\\\\''", "a'b'cdef\\'"),
+    ("str_quoted_esc_quote_single_bad", "'abc\\\\'def'", GrammarParseError),
+    ("str_quoted_esc_quote_double_1", '"abc\\"def"', 'abc"def'),
+    ("str_quoted_esc_quote_double_2", '"abc\\\\\\"def"', 'abc\\"def'),
+    ("str_quoted_esc_quote_double_3", '"abc\\\\\\\\\\"def"', 'abc\\\\"def'),
+    ("str_quoted_esc_quote_double_4", '"a\\"b\\"cdef\\\\\\""', 'a"b"cdef\\"'),
+    ("str_quoted_esc_quote_double_bad", '"abc\\\\"def"', GrammarParseError),
     # Lists and dictionaries.
     ("list", "[0, 1]", [0, 1]),
     (
@@ -218,9 +235,9 @@
     # Interpolations in quoted strings.
     ("str_quoted_inter", "'${null}'", "None"),
     ("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"),
-    ("str_quoted_esc_single_2", "'\"\\\\\\\\\\${foo}\\ '", r'"\${foo}\ '),
+    ("str_quoted_esc_single_2", "'\"\\\\\\${foo}\\ '", r'"\${foo}\ '),
     ("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'),
-    ("str_quoted_esc_double_2", '"\'\\\\\\\\\\${foo}\\ "', r"'\${foo}\ "),
+    ("str_quoted_esc_double_2", '"\'\\\\\\${foo}\\ "', r"'\${foo}\ "),
     ("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError),
     # Whitespaces.
     ("ws_inter_node_outer", "${ \tdict.a  \t}", 0),
@@ -240,7 +257,7 @@
     ("nested_simple", "${${ref_str}}", "hi"),
     ("nested_select", "${options.${choice}}", "A"),
     ("nested_relative", "${${rel_opt}.b}", "B"),
-    ("str_quoted_nested", r"'AB${test:\'CD${test:\\'EF\\'}GH\'}'", "ABCDEFGH"),
+    ("str_quoted_nested", r"'AB${test:\'CD${test:\\\'EF\\\'}GH\'}'", "ABCDEFGH"),
     # Resolver interpolations.
     ("no_args", "${test:}", []),
     ("space_in_args", "${test:a, b c}", ["a", "b c"]),
@@ -272,8 +289,7 @@
     ("dict_nan_key_2", "${first:{${test:nan}: 0}}", GrammarParseError),
 ]
 
-# Parameters for tests of the "configValue" rule (may contain node
-# interpolations, but no resolvers).
+# Parameters for tests of the "configValue" rule (may contain interpolations).
 PARAMS_CONFIG_VALUE = [
     # String interpolations (top-level).
     ("str_top_basic", "bonjour ${str}", "bonjour hi"),
@@ -298,7 +314,9 @@
     ("str_top_middle_quotes_single", "I like '${str}'", "I like 'hi'"),
     ("str_top_middle_quotes_double", 'I like "${str}"', 'I like "hi"'),
     ("str_top_any_char", "${str} !@\\#$%^&*})][({,/?;", "hi !@\\#$%^&*})][({,/?;"),
-    ("str_top_esc_inter", r"Esc: \${str}", "Esc: ${str}"),
+    ("str_top_esc_inter_1", r"Esc: \${str}", "Esc: ${str}"),
+    ("str_top_esc_inter_2", r"Esc: \\\${str}", r"Esc: \${str}"),
+    ("str_top_esc_inter_3", r"abc\\${str}\${str}\\", r"abc\hi${str}\\"),
     ("str_top_esc_inter_wrong_1", r"Wrong: $\{str\}", r"Wrong: $\{str\}"),
     ("str_top_esc_inter_wrong_2", r"Wrong: \${str\}", r"Wrong: ${str\}"),
     ("str_top_esc_backslash", r"Esc: \\${str}", r"Esc: \hi"),
@@ -307,8 +325,9 @@
     ("str_top_trailing_dollars", r"${str}$$$$", "hi$$$$"),
     ("str_top_leading_escapes", r"\\\\\${str}", r"\\${str}"),
     ("str_top_middle_escapes", r"abc\\\\\${str}", r"abc\\${str}"),
-    ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 3),
+    ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 5),
     ("str_top_concat_interpolations", "${null}${float}", "None1.2"),
+    ("str_top_issue_617", r""" ${test: "hi\\" }"} """, ' hi\\"} '),
     # Whitespaces.
     ("ws_toplevel", "  \tab  ${str} cd  ${int}\t", "  \tab  hi cd  123\t"),
     # Unmatched braces.
@@ -376,6 +395,7 @@ def test_config_value(
         self, restore_resolvers: Any, definition: str, expected: Any
     ) -> None:
         parse_tree, expected_visit = self._parse("configValue", definition, expected)
+        OmegaConf.register_new_resolver("test", self._resolver_test)
         self._visit_with_config(parse_tree, expected_visit)
 
     @parametrize_from(