From 58821c7206ca1aaa3d2dc0f5a564137d75b09516 Mon Sep 17 00:00:00 2001
From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com>
Date: Thu, 6 May 2021 08:38:53 -0400
Subject: [PATCH 1/4] Grammar: only un-escape \\ in situations where escaping
 is required

This simplifies situations where one wants to obtain a string containing
multiple backslashes.
---
 omegaconf/grammar/OmegaConfGrammarLexer.g4  | 59 ++++++++++++++++-----
 omegaconf/grammar/OmegaConfGrammarParser.g4 |  2 +-
 omegaconf/grammar_visitor.py                |  7 ++-
 tests/test_grammar.py                       | 24 ++++++---
 4 files changed, 70 insertions(+), 22 deletions(-)

diff --git a/omegaconf/grammar/OmegaConfGrammarLexer.g4 b/omegaconf/grammar/OmegaConfGrammarLexer.g4
index 50c4f10a6..e7325f8ad 100644
--- a/omegaconf/grammar/OmegaConfGrammarLexer.g4
+++ b/omegaconf/grammar/OmegaConfGrammarLexer.g4
@@ -16,13 +16,23 @@ fragment ESC_BACKSLASH: '\\\\';  // escaped backslash
 
 TOP_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);
 
-ESC_INTER: '\\${';
-TOP_ESC: ESC_BACKSLASH+ -> type(ESC);
+// Regular string: anything that does not contain any $ and does not end with \
+// (this ensures this rule will not consume characters required to recognize other tokens).
+ANY_STR: ~[$]* ~[\\$];
+
+// Escaped interpolation: '\${', optionally preceded by an even number of \
+ESC_INTER: ESC_BACKSLASH* '\\${';
+
+// Escaped backslashes: even number of \ followed by an interpolation.
+// The interpolation must *not* be matched by this rule (this is why we use a predicate lookahead).
+TOP_ESC: ESC_BACKSLASH+ { self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{") }? -> type(ESC);
+
+// Other backslashes that will not need escaping.
+BACKSLASHES: '\\'+ -> type(ANY_STR);
+
+// The dollar sign must be singled out so that we can recognize interpolations.
+DOLLAR: '$' -> type(ANY_STR);
 
-// The backslash and dollar characters must not be grouped with others, so that
-// we can properly detect the tokens above.
-SPECIAL_CHAR: [\\$];
-ANY_STR: ~[\\$]+;  // anything else
 
 ////////////////
 // VALUE_MODE //
@@ -96,11 +106,26 @@ mode QUOTED_SINGLE_MODE;
 QSINGLE_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);
 MATCHING_QUOTE_CLOSE: '\'' -> popMode;
 
-QSINGLE_ESC: ('\\\'' | ESC_BACKSLASH)+ -> type(ESC);
+// Regular string: anything that does not contain any $ *or quote* and does not end with \
+QSINGLE_STR: ~['$]* ~['\\$] -> type(ANY_STR);
+
 QSINGLE_ESC_INTER: ESC_INTER -> type(ESC_INTER);
 
-QSINGLE_SPECIAL_CHAR: SPECIAL_CHAR -> type(SPECIAL_CHAR);
-QSINGLE_STR: ~['\\$]+ -> type(ANY_STR);
+// In a quoted string we also have the following escape sequences:
+//    - \', optionally preceded by an even number of \ (escaped quote)
+//    - an even number of \ followed by either the closing quote (trailing backslashes) or by
+//      an interpolation (as in `DEFAULT_MODE`) -- which must not be matched by this rule
+QSINGLE_ESC:
+    (
+        ESC_BACKSLASH* '\\\'' |
+        ESC_BACKSLASH+ {(
+            self._input.LA(1) == ord("'") or
+            (self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{"))
+        )}?
+    ) -> type(ESC);
+
+QSINGLE_BACKSLASHES: '\\'+ -> type(ANY_STR);
+QSINGLE_DOLLAR: '$' -> type(ANY_STR);
 
 
 ////////////////////////
@@ -114,8 +139,18 @@ mode QUOTED_DOUBLE_MODE;
 QDOUBLE_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);
 QDOUBLE_CLOSE: '"' -> type(MATCHING_QUOTE_CLOSE), popMode;
 
-QDOUBLE_ESC: ('\\"' | ESC_BACKSLASH)+ -> type(ESC);
+QDOUBLE_STR: ~["$]* ~["\\$] -> type(ANY_STR);
+
 QDOUBLE_ESC_INTER: ESC_INTER -> type(ESC_INTER);
 
-QDOUBLE_SPECIAL_CHAR: SPECIAL_CHAR -> type(SPECIAL_CHAR);
-QDOUBLE_STR: ~["\\$]+ -> type(ANY_STR);
+QDOUBLE_ESC:
+    (
+        ESC_BACKSLASH* '\\"' |
+        ESC_BACKSLASH+ {(
+            self._input.LA(1) == ord('"') or
+            (self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{"))
+        )}?
+    ) -> type(ESC);
+
+QDOUBLE_BACKSLASHES: '\\'+ -> type(ANY_STR);
+QDOUBLE_DOLLAR: '$' -> type(ANY_STR);
diff --git a/omegaconf/grammar/OmegaConfGrammarParser.g4 b/omegaconf/grammar/OmegaConfGrammarParser.g4
index 4383da946..5ec70cf07 100644
--- a/omegaconf/grammar/OmegaConfGrammarParser.g4
+++ b/omegaconf/grammar/OmegaConfGrammarParser.g4
@@ -23,7 +23,7 @@ singleElement: element EOF;
 
 // Composite text expression (may contain interpolations).
 
-text: (interpolation | ESC | ESC_INTER | SPECIAL_CHAR | ANY_STR)+;
+text: (interpolation | ESC | ESC_INTER | ANY_STR)+;
 
 
 // Elements.
diff --git a/omegaconf/grammar_visitor.py b/omegaconf/grammar_visitor.py
index 4588253ef..8df6bfa70 100644
--- a/omegaconf/grammar_visitor.py
+++ b/omegaconf/grammar_visitor.py
@@ -289,7 +289,7 @@ def visitSingleElement(
         return self.visit(ctx.getChild(0))
 
     def visitText(self, ctx: OmegaConfGrammarParser.TextContext) -> Any:
-        # (interpolation | ESC | ESC_INTER | SPECIAL_CHAR | ANY_STR)+
+        # (interpolation | ESC | ESC_INTER | ANY_STR)+
 
         # Single interpolation? If yes, return its resolved value "as is".
         if ctx.getChildCount() == 1:
@@ -356,7 +356,10 @@ def _unescape(
                 if s.type == OmegaConfGrammarLexer.ESC:
                     chrs.append(s.text[1::2])
                 elif s.type == OmegaConfGrammarLexer.ESC_INTER:
-                    chrs.append(s.text[1:])
+                    # `ESC_INTER` is of the form `\\...\${`: the formula below computes
+                    # the number of characters to keep at the end of the string to remove
+                    # the correct number of backslashes.
+                    chrs.append(s.text[-(len(s.text) // 2 + 1) :])
                 else:
                     chrs.append(s.text)
             else:
diff --git a/tests/test_grammar.py b/tests/test_grammar.py
index 025511bea..cabf4e54c 100644
--- a/tests/test_grammar.py
+++ b/tests/test_grammar.py
@@ -143,9 +143,14 @@
     ("str_quoted_too_many_3", "''a''", GrammarParseError),
     ("str_quoted_trailing_esc_1", r"'abc\\'", r" abc\ ".strip()),
     ("str_quoted_trailing_esc_2", r"'abc\\\\'", r" abc\\ ".strip()),
-    ("str_quoted_no_esc_1", r'"abc\def"', r"abc\def"),
-    ("str_quoted_no_esc_2", r'"abc\\def"', r"abc\def"),
-    ("str_quoted_no_esc_3", r'"\\\abc\def"', r"\\abc\def"),
+    ("str_quoted_no_esc_single_1", r"'abc\def'", r"abc\def"),
+    ("str_quoted_no_esc_single_2", r"'abc\\def'", r"abc\\def"),
+    ("str_quoted_no_esc_single_3", r"'\\\abc\def'", r"\\\abc\def"),
+    ("str_quoted_no_esc_dollar_single", r"'abc\\$$'", r"abc\\$$"),
+    ("str_quoted_no_esc_double_1", r'"abc\def"', r"abc\def"),
+    ("str_quoted_no_esc_double_2", r'"abc\\def"', r"abc\\def"),
+    ("str_quoted_no_esc_double_3", r'"\\\abc\def"', r"\\\abc\def"),
+    ("str_quoted_no_esc_dollar_double", r'"abc\\$$"', r"abc\\$$"),
     ("str_quoted_bad_1", r'"abc\"', GrammarParseError),
     ("str_quoted_bad_2", r'"abc\\\"', GrammarParseError),
     ("str_quoted_esc_quote_single_1", r"'abc\'def'", "abc'def"),
@@ -255,8 +260,10 @@
     ("str_quoted_inter", "'${null}'", "None"),
     ("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"),
     ("str_quoted_esc_single_2", r"""'\\\${foo}'""", r"\${foo}"),
+    ("str_quoted_esc_single_3", r"""'\\a_${str}\\'""", r" \\a_hi\ ".strip()),
     ("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'),
     ("str_quoted_esc_double_2", r'''"\\\${foo}"''', r"\${foo}"),
+    ("str_quoted_esc_double_3", r'''"\\a_${str}\\"''', r" \\a_hi\ ".strip()),
     ("str_quoted_other_quote_double", """'double"'""", 'double"'),
     ("str_quoted_other_quote_single", '''"single'"''', "single'"),
     ("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError),
@@ -357,13 +364,16 @@
     ("str_top_esc_inter", r"Esc: \${str}", "Esc: ${str}"),
     ("str_top_esc_inter_wrong_1", r"Wrong: $\{str\}", r"Wrong: $\{str\}"),
     ("str_top_esc_inter_wrong_2", r"Wrong: \${str\}", r"Wrong: ${str\}"),
-    ("str_top_esc_backslash", r"Esc: \\${str}", r"Esc: \hi"),
+    ("str_top_esc_backslash_1", r"Esc: \\${str}", r"Esc: \hi"),
+    ("str_top_esc_backslash_2", r"Esc: \\\\${str}", r"Esc: \\hi"),
     ("str_top_quoted_braces_wrong", r"Wrong: \{${str}\}", r"Wrong: \{hi\}"),
     ("str_top_leading_dollars", r"$$${str}", "$$hi"),
     ("str_top_trailing_dollars", r"${str}$$$$", "hi$$$$"),
-    ("str_top_leading_escapes", r"\\\\\${str}", r"\\${str}"),
-    ("str_top_middle_escapes", r"abc\\\\\${str}", r"abc\\${str}"),
-    ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 3),
+    ("str_top_leading_escapes_1", r"\\\\\${str}", r"\\${str}"),
+    ("str_top_leading_escapes_2", r"\\\\ \${str}", r"\\\\ ${str}"),
+    ("str_top_middle_escapes_1", r"abc\\\\\${str}", r"abc\\${str}"),
+    ("str_top_middle_escapes_2", r"abc\\\\ \${str}", r"abc\\\\ ${str}"),
+    ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 5),
     ("str_top_concat_interpolations", "${null}${float}", "None1.2"),
     ("str_top_issue_617", r""" ${test: "hi\\" }"} """, r" hi\"} "),
     # Whitespaces.

From f94e3dadc96ebe3d0131cd8a0064ba31018627dc Mon Sep 17 00:00:00 2001
From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com>
Date: Thu, 6 May 2021 15:47:47 -0400
Subject: [PATCH 2/4] Split test in two

---
 tests/test_grammar.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_grammar.py b/tests/test_grammar.py
index cabf4e54c..ffe22595f 100644
--- a/tests/test_grammar.py
+++ b/tests/test_grammar.py
@@ -260,10 +260,12 @@
     ("str_quoted_inter", "'${null}'", "None"),
     ("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"),
     ("str_quoted_esc_single_2", r"""'\\\${foo}'""", r"\${foo}"),
-    ("str_quoted_esc_single_3", r"""'\\a_${str}\\'""", r" \\a_hi\ ".strip()),
+    ("str_quoted_esc_single_3", r"""'\\a_${str}'""", r"\\a_hi"),
+    ("str_quoted_esc_single_4", r"""'a_${str}\\'""", r" a_hi\ ".strip()),
     ("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'),
     ("str_quoted_esc_double_2", r'''"\\\${foo}"''', r"\${foo}"),
-    ("str_quoted_esc_double_3", r'''"\\a_${str}\\"''', r" \\a_hi\ ".strip()),
+    ("str_quoted_esc_double_3", r'''"\\a_${str}"''', r"\\a_hi"),
+    ("str_quoted_esc_double_4", r'''"a_${str}\\"''', r" a_hi\ ".strip()),
     ("str_quoted_other_quote_double", """'double"'""", 'double"'),
     ("str_quoted_other_quote_single", '''"single'"''', "single'"),
     ("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError),

From 1eb0dcc780c7f0946abe6703bf5c5f2998809912 Mon Sep 17 00:00:00 2001
From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com>
Date: Thu, 6 May 2021 15:52:15 -0400
Subject: [PATCH 3/4] Minor test tweak

---
 tests/test_grammar.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_grammar.py b/tests/test_grammar.py
index ffe22595f..c45d5e4a7 100644
--- a/tests/test_grammar.py
+++ b/tests/test_grammar.py
@@ -375,7 +375,7 @@
     ("str_top_leading_escapes_2", r"\\\\ \${str}", r"\\\\ ${str}"),
     ("str_top_middle_escapes_1", r"abc\\\\\${str}", r"abc\\${str}"),
     ("str_top_middle_escapes_2", r"abc\\\\ \${str}", r"abc\\\\ ${str}"),
-    ("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 5),
+    ("str_top_trailing_escapes", r" ${str}\\\ ".strip(), r" hi\\\ ".strip()),
     ("str_top_concat_interpolations", "${null}${float}", "None1.2"),
     ("str_top_issue_617", r""" ${test: "hi\\" }"} """, r" hi\"} "),
     # Whitespaces.

From c5d3a0763c6b97201e6f7dcfc1613e55857abd74 Mon Sep 17 00:00:00 2001
From: Olivier Delalleau <507137+odelalleau@users.noreply.github.com>
Date: Thu, 6 May 2021 18:59:25 -0400
Subject: [PATCH 4/4] Move the detection of \ to escape from lexer to visitor

---
 omegaconf/grammar/OmegaConfGrammarLexer.g4  | 35 ++++-----------
 omegaconf/grammar/OmegaConfGrammarParser.g4 |  2 +-
 omegaconf/grammar_visitor.py                | 48 +++++++++++++++------
 3 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/omegaconf/grammar/OmegaConfGrammarLexer.g4 b/omegaconf/grammar/OmegaConfGrammarLexer.g4
index e7325f8ad..73f616b7c 100644
--- a/omegaconf/grammar/OmegaConfGrammarLexer.g4
+++ b/omegaconf/grammar/OmegaConfGrammarLexer.g4
@@ -23,11 +23,10 @@ ANY_STR: ~[$]* ~[\\$];
 // Escaped interpolation: '\${', optionally preceded by an even number of \
 ESC_INTER: ESC_BACKSLASH* '\\${';
 
-// Escaped backslashes: even number of \ followed by an interpolation.
-// The interpolation must *not* be matched by this rule (this is why we use a predicate lookahead).
-TOP_ESC: ESC_BACKSLASH+ { self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{") }? -> type(ESC);
+// Backslashes that *may* be escaped (even number).
+TOP_ESC: ESC_BACKSLASH+;
 
-// Other backslashes that will not need escaping.
+// Other backslashes that will not need escaping (odd number due to not matching the previous rule).
 BACKSLASHES: '\\'+ -> type(ANY_STR);
 
 // The dollar sign must be singled out so that we can recognize interpolations.
@@ -111,19 +110,10 @@ QSINGLE_STR: ~['$]* ~['\\$] -> type(ANY_STR);
 
 QSINGLE_ESC_INTER: ESC_INTER -> type(ESC_INTER);
 
-// In a quoted string we also have the following escape sequences:
-//    - \', optionally preceded by an even number of \ (escaped quote)
-//    - an even number of \ followed by either the closing quote (trailing backslashes) or by
-//      an interpolation (as in `DEFAULT_MODE`) -- which must not be matched by this rule
-QSINGLE_ESC:
-    (
-        ESC_BACKSLASH* '\\\'' |
-        ESC_BACKSLASH+ {(
-            self._input.LA(1) == ord("'") or
-            (self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{"))
-        )}?
-    ) -> type(ESC);
+// Escaped quote (optionally preceded by an even number of backslashes).
+QSINGLE_ESC_QUOTE: ESC_BACKSLASH* '\\\'' -> type(ESC);
 
+QUOTED_ESC: ESC_BACKSLASH+;
 QSINGLE_BACKSLASHES: '\\'+ -> type(ANY_STR);
 QSINGLE_DOLLAR: '$' -> type(ANY_STR);
 
@@ -140,17 +130,8 @@ QDOUBLE_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE)
 QDOUBLE_CLOSE: '"' -> type(MATCHING_QUOTE_CLOSE), popMode;
 
 QDOUBLE_STR: ~["$]* ~["\\$] -> type(ANY_STR);
-
 QDOUBLE_ESC_INTER: ESC_INTER -> type(ESC_INTER);
-
-QDOUBLE_ESC:
-    (
-        ESC_BACKSLASH* '\\"' |
-        ESC_BACKSLASH+ {(
-            self._input.LA(1) == ord('"') or
-            (self._input.LA(1) == ord("$") and self._input.LA(2) == ord("{"))
-        )}?
-    ) -> type(ESC);
-
+QDOUBLE_ESC_QUOTE: ESC_BACKSLASH* '\\"' -> type(ESC);
+QDOUBLE_ESC: ESC_BACKSLASH+ -> type(QUOTED_ESC);
 QDOUBLE_BACKSLASHES: '\\'+ -> type(ANY_STR);
 QDOUBLE_DOLLAR: '$' -> type(ANY_STR);
diff --git a/omegaconf/grammar/OmegaConfGrammarParser.g4 b/omegaconf/grammar/OmegaConfGrammarParser.g4
index 5ec70cf07..f0f256c9a 100644
--- a/omegaconf/grammar/OmegaConfGrammarParser.g4
+++ b/omegaconf/grammar/OmegaConfGrammarParser.g4
@@ -23,7 +23,7 @@ singleElement: element EOF;
 
 // Composite text expression (may contain interpolations).
 
-text: (interpolation | ESC | ESC_INTER | ANY_STR)+;
+text: (interpolation | ANY_STR | ESC | ESC_INTER | TOP_ESC | QUOTED_ESC)+;
 
 
 // Elements.
diff --git a/omegaconf/grammar_visitor.py b/omegaconf/grammar_visitor.py
index 8df6bfa70..1771516dc 100644
--- a/omegaconf/grammar_visitor.py
+++ b/omegaconf/grammar_visitor.py
@@ -1,12 +1,12 @@
 import sys
 import warnings
+from itertools import zip_longest
 from typing import (
     TYPE_CHECKING,
     Any,
     Callable,
     Dict,
     Generator,
-    Iterable,
     List,
     Optional,
     Set,
@@ -289,7 +289,7 @@ def visitSingleElement(
         return self.visit(ctx.getChild(0))
 
     def visitText(self, ctx: OmegaConfGrammarParser.TextContext) -> Any:
-        # (interpolation | ESC | ESC_INTER | ANY_STR)+
+        # (interpolation | ANY_STR | ESC | ESC_INTER | TOP_ESC | QUOTED_ESC)+
 
         # Single interpolation? If yes, return its resolved value "as is".
         if ctx.getChildCount() == 1:
@@ -298,7 +298,7 @@ def visitText(self, ctx: OmegaConfGrammarParser.TextContext) -> Any:
                 return self.visitInterpolation(c)
 
         # Otherwise, concatenate string representations together.
-        return self._unescape(ctx.getChildren())
+        return self._unescape(list(ctx.getChildren()))
 
     def _createPrimitive(
         self,
@@ -336,11 +336,11 @@ def _createPrimitive(
                 raise AssertionError("WS should never be reached")
             assert False, symbol.type
         # Concatenation of multiple items ==> un-escape the concatenation.
-        return self._unescape(ctx.getChildren())
+        return self._unescape(list(ctx.getChildren()))
 
     def _unescape(
         self,
-        seq: Iterable[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]],
+        seq: List[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]],
     ) -> str:
         """
         Concatenate all symbols / interpolations in `seq`, unescaping symbols as needed.
@@ -350,19 +350,43 @@ def _unescape(
         resolving of the interpolation).
         """
         chrs = []
-        for node in seq:
+        for node, next_node in zip_longest(seq, seq[1:]):
             if isinstance(node, TerminalNode):
                 s = node.symbol
-                if s.type == OmegaConfGrammarLexer.ESC:
-                    chrs.append(s.text[1::2])
-                elif s.type == OmegaConfGrammarLexer.ESC_INTER:
+                if s.type == OmegaConfGrammarLexer.ESC_INTER:
                     # `ESC_INTER` is of the form `\\...\${`: the formula below computes
                     # the number of characters to keep at the end of the string to remove
                     # the correct number of backslashes.
-                    chrs.append(s.text[-(len(s.text) // 2 + 1) :])
+                    text = s.text[-(len(s.text) // 2 + 1) :]
+                elif (
+                    # Character sequence identified as requiring un-escaping.
+                    s.type == OmegaConfGrammarLexer.ESC
+                    or (
+                        # At top level, we need to un-escape backslashes that precede
+                        # an interpolation.
+                        s.type == OmegaConfGrammarLexer.TOP_ESC
+                        and isinstance(
+                            next_node, OmegaConfGrammarParser.InterpolationContext
+                        )
+                    )
+                    or (
+                        # In a quoted sring, we need to un-escape backslashes that
+                        # either end the string, or are followed by an interpolation.
+                        s.type == OmegaConfGrammarLexer.QUOTED_ESC
+                        and (
+                            next_node is None
+                            or isinstance(
+                                next_node, OmegaConfGrammarParser.InterpolationContext
+                            )
+                        )
+                    )
+                ):
+                    text = s.text[1::2]  # un-escape the sequence
                 else:
-                    chrs.append(s.text)
+                    text = s.text  # keep the original text
             else:
                 assert isinstance(node, OmegaConfGrammarParser.InterpolationContext)
-                chrs.append(str(self.visitInterpolation(node)))
+                text = str(self.visitInterpolation(node))
+            chrs.append(text)
+
         return "".join(chrs)