Skip to content

Commit

Permalink
Grammar: only un-escape \\ in situations where escaping is required (#…
Browse files Browse the repository at this point in the history
…708)

Grammar: only un-escape \\ in situations where escaping is required

This simplifies situations where one wants to obtain a string containing
multiple backslashes.
  • Loading branch information
odelalleau authored May 7, 2021
1 parent d5c1d28 commit e4681ef
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 33 deletions.
42 changes: 29 additions & 13 deletions omegaconf/grammar/OmegaConfGrammarLexer.g4
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,22 @@ fragment ESC_BACKSLASH: '\\\\'; // escaped backslash

TOP_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);

ESC_INTER: '\\${';
TOP_ESC: ESC_BACKSLASH+ -> type(ESC);
// Regular string: anything that does not contain any $ and does not end with \
// (this ensures this rule will not consume characters required to recognize other tokens).
ANY_STR: ~[$]* ~[\\$];

// Escaped interpolation: '\${', optionally preceded by an even number of \
ESC_INTER: ESC_BACKSLASH* '\\${';

// Backslashes that *may* be escaped (even number).
TOP_ESC: ESC_BACKSLASH+;

// Other backslashes that will not need escaping (odd number due to not matching the previous rule).
BACKSLASHES: '\\'+ -> type(ANY_STR);

// The dollar sign must be singled out so that we can recognize interpolations.
DOLLAR: '$' -> type(ANY_STR);

// The backslash and dollar characters must not be grouped with others, so that
// we can properly detect the tokens above.
SPECIAL_CHAR: [\\$];
ANY_STR: ~[\\$]+; // anything else

////////////////
// VALUE_MODE //
Expand Down Expand Up @@ -96,11 +105,17 @@ mode QUOTED_SINGLE_MODE;
QSINGLE_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);
MATCHING_QUOTE_CLOSE: '\'' -> popMode;

QSINGLE_ESC: ('\\\'' | ESC_BACKSLASH)+ -> type(ESC);
// Regular string: anything that does not contain any $ *or quote* and does not end with \
QSINGLE_STR: ~['$]* ~['\\$] -> type(ANY_STR);

QSINGLE_ESC_INTER: ESC_INTER -> type(ESC_INTER);

QSINGLE_SPECIAL_CHAR: SPECIAL_CHAR -> type(SPECIAL_CHAR);
QSINGLE_STR: ~['\\$]+ -> type(ANY_STR);
// Escaped quote (optionally preceded by an even number of backslashes).
QSINGLE_ESC_QUOTE: ESC_BACKSLASH* '\\\'' -> type(ESC);

QUOTED_ESC: ESC_BACKSLASH+;
QSINGLE_BACKSLASHES: '\\'+ -> type(ANY_STR);
QSINGLE_DOLLAR: '$' -> type(ANY_STR);


////////////////////////
Expand All @@ -114,8 +129,9 @@ mode QUOTED_DOUBLE_MODE;
QDOUBLE_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);
QDOUBLE_CLOSE: '"' -> type(MATCHING_QUOTE_CLOSE), popMode;

QDOUBLE_ESC: ('\\"' | ESC_BACKSLASH)+ -> type(ESC);
QDOUBLE_STR: ~["$]* ~["\\$] -> type(ANY_STR);
QDOUBLE_ESC_INTER: ESC_INTER -> type(ESC_INTER);
QDOUBLE_SPECIAL_CHAR: SPECIAL_CHAR -> type(SPECIAL_CHAR);
QDOUBLE_STR: ~["\\$]+ -> type(ANY_STR);
QDOUBLE_ESC_QUOTE: ESC_BACKSLASH* '\\"' -> type(ESC);
QDOUBLE_ESC: ESC_BACKSLASH+ -> type(QUOTED_ESC);
QDOUBLE_BACKSLASHES: '\\'+ -> type(ANY_STR);
QDOUBLE_DOLLAR: '$' -> type(ANY_STR);
2 changes: 1 addition & 1 deletion omegaconf/grammar/OmegaConfGrammarParser.g4
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ singleElement: element EOF;

// Composite text expression (may contain interpolations).

text: (interpolation | ESC | ESC_INTER | SPECIAL_CHAR | ANY_STR)+;
text: (interpolation | ANY_STR | ESC | ESC_INTER | TOP_ESC | QUOTED_ESC)+;


// Elements.
Expand Down
51 changes: 39 additions & 12 deletions omegaconf/grammar_visitor.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import sys
import warnings
from itertools import zip_longest
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Generator,
Iterable,
List,
Optional,
Set,
Expand Down Expand Up @@ -289,7 +289,7 @@ def visitSingleElement(
return self.visit(ctx.getChild(0))

def visitText(self, ctx: OmegaConfGrammarParser.TextContext) -> Any:
# (interpolation | ESC | ESC_INTER | SPECIAL_CHAR | ANY_STR)+
# (interpolation | ANY_STR | ESC | ESC_INTER | TOP_ESC | QUOTED_ESC)+

# Single interpolation? If yes, return its resolved value "as is".
if ctx.getChildCount() == 1:
Expand All @@ -298,7 +298,7 @@ def visitText(self, ctx: OmegaConfGrammarParser.TextContext) -> Any:
return self.visitInterpolation(c)

# Otherwise, concatenate string representations together.
return self._unescape(ctx.getChildren())
return self._unescape(list(ctx.getChildren()))

def _createPrimitive(
self,
Expand Down Expand Up @@ -336,11 +336,11 @@ def _createPrimitive(
raise AssertionError("WS should never be reached")
assert False, symbol.type
# Concatenation of multiple items ==> un-escape the concatenation.
return self._unescape(ctx.getChildren())
return self._unescape(list(ctx.getChildren()))

def _unescape(
self,
seq: Iterable[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]],
seq: List[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]],
) -> str:
"""
Concatenate all symbols / interpolations in `seq`, unescaping symbols as needed.
Expand All @@ -350,16 +350,43 @@ def _unescape(
resolving of the interpolation).
"""
chrs = []
for node in seq:
for node, next_node in zip_longest(seq, seq[1:]):
if isinstance(node, TerminalNode):
s = node.symbol
if s.type == OmegaConfGrammarLexer.ESC:
chrs.append(s.text[1::2])
elif s.type == OmegaConfGrammarLexer.ESC_INTER:
chrs.append(s.text[1:])
if s.type == OmegaConfGrammarLexer.ESC_INTER:
# `ESC_INTER` is of the form `\\...\${`: the formula below computes
# the number of characters to keep at the end of the string to remove
# the correct number of backslashes.
text = s.text[-(len(s.text) // 2 + 1) :]
elif (
# Character sequence identified as requiring un-escaping.
s.type == OmegaConfGrammarLexer.ESC
or (
# At top level, we need to un-escape backslashes that precede
# an interpolation.
s.type == OmegaConfGrammarLexer.TOP_ESC
and isinstance(
next_node, OmegaConfGrammarParser.InterpolationContext
)
)
or (
# In a quoted sring, we need to un-escape backslashes that
# either end the string, or are followed by an interpolation.
s.type == OmegaConfGrammarLexer.QUOTED_ESC
and (
next_node is None
or isinstance(
next_node, OmegaConfGrammarParser.InterpolationContext
)
)
)
):
text = s.text[1::2] # un-escape the sequence
else:
chrs.append(s.text)
text = s.text # keep the original text
else:
assert isinstance(node, OmegaConfGrammarParser.InterpolationContext)
chrs.append(str(self.visitInterpolation(node)))
text = str(self.visitInterpolation(node))
chrs.append(text)

return "".join(chrs)
26 changes: 19 additions & 7 deletions tests/test_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,14 @@
("str_quoted_too_many_3", "''a''", GrammarParseError),
("str_quoted_trailing_esc_1", r"'abc\\'", r" abc\ ".strip()),
("str_quoted_trailing_esc_2", r"'abc\\\\'", r" abc\\ ".strip()),
("str_quoted_no_esc_1", r'"abc\def"', r"abc\def"),
("str_quoted_no_esc_2", r'"abc\\def"', r"abc\def"),
("str_quoted_no_esc_3", r'"\\\abc\def"', r"\\abc\def"),
("str_quoted_no_esc_single_1", r"'abc\def'", r"abc\def"),
("str_quoted_no_esc_single_2", r"'abc\\def'", r"abc\\def"),
("str_quoted_no_esc_single_3", r"'\\\abc\def'", r"\\\abc\def"),
("str_quoted_no_esc_dollar_single", r"'abc\\$$'", r"abc\\$$"),
("str_quoted_no_esc_double_1", r'"abc\def"', r"abc\def"),
("str_quoted_no_esc_double_2", r'"abc\\def"', r"abc\\def"),
("str_quoted_no_esc_double_3", r'"\\\abc\def"', r"\\\abc\def"),
("str_quoted_no_esc_dollar_double", r'"abc\\$$"', r"abc\\$$"),
("str_quoted_bad_1", r'"abc\"', GrammarParseError),
("str_quoted_bad_2", r'"abc\\\"', GrammarParseError),
("str_quoted_esc_quote_single_1", r"'abc\'def'", "abc'def"),
Expand Down Expand Up @@ -255,8 +260,12 @@
("str_quoted_inter", "'${null}'", "None"),
("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"),
("str_quoted_esc_single_2", r"""'\\\${foo}'""", r"\${foo}"),
("str_quoted_esc_single_3", r"""'\\a_${str}'""", r"\\a_hi"),
("str_quoted_esc_single_4", r"""'a_${str}\\'""", r" a_hi\ ".strip()),
("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'),
("str_quoted_esc_double_2", r'''"\\\${foo}"''', r"\${foo}"),
("str_quoted_esc_double_3", r'''"\\a_${str}"''', r"\\a_hi"),
("str_quoted_esc_double_4", r'''"a_${str}\\"''', r" a_hi\ ".strip()),
("str_quoted_other_quote_double", """'double"'""", 'double"'),
("str_quoted_other_quote_single", '''"single'"''', "single'"),
("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError),
Expand Down Expand Up @@ -357,13 +366,16 @@
("str_top_esc_inter", r"Esc: \${str}", "Esc: ${str}"),
("str_top_esc_inter_wrong_1", r"Wrong: $\{str\}", r"Wrong: $\{str\}"),
("str_top_esc_inter_wrong_2", r"Wrong: \${str\}", r"Wrong: ${str\}"),
("str_top_esc_backslash", r"Esc: \\${str}", r"Esc: \hi"),
("str_top_esc_backslash_1", r"Esc: \\${str}", r"Esc: \hi"),
("str_top_esc_backslash_2", r"Esc: \\\\${str}", r"Esc: \\hi"),
("str_top_quoted_braces_wrong", r"Wrong: \{${str}\}", r"Wrong: \{hi\}"),
("str_top_leading_dollars", r"$$${str}", "$$hi"),
("str_top_trailing_dollars", r"${str}$$$$", "hi$$$$"),
("str_top_leading_escapes", r"\\\\\${str}", r"\\${str}"),
("str_top_middle_escapes", r"abc\\\\\${str}", r"abc\\${str}"),
("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 3),
("str_top_leading_escapes_1", r"\\\\\${str}", r"\\${str}"),
("str_top_leading_escapes_2", r"\\\\ \${str}", r"\\\\ ${str}"),
("str_top_middle_escapes_1", r"abc\\\\\${str}", r"abc\\${str}"),
("str_top_middle_escapes_2", r"abc\\\\ \${str}", r"abc\\\\ ${str}"),
("str_top_trailing_escapes", r" ${str}\\\ ".strip(), r" hi\\\ ".strip()),
("str_top_concat_interpolations", "${null}${float}", "None1.2"),
("str_top_issue_617", r""" ${test: "hi\\" }"} """, r" hi\"} "),
# Whitespaces.
Expand Down

0 comments on commit e4681ef

Please sign in to comment.