Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Grammar: only un-escape \\ in situations where escaping is required #708

Merged
merged 4 commits into from
May 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 29 additions & 13 deletions omegaconf/grammar/OmegaConfGrammarLexer.g4
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,22 @@ fragment ESC_BACKSLASH: '\\\\'; // escaped backslash

TOP_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);

ESC_INTER: '\\${';
TOP_ESC: ESC_BACKSLASH+ -> type(ESC);
// Regular string: anything that does not contain any $ and does not end with \
// (this ensures this rule will not consume characters required to recognize other tokens).
ANY_STR: ~[$]* ~[\\$];
omry marked this conversation as resolved.
Show resolved Hide resolved

// Escaped interpolation: '\${', optionally preceded by an even number of \
ESC_INTER: ESC_BACKSLASH* '\\${';

// Backslashes that *may* be escaped (even number).
TOP_ESC: ESC_BACKSLASH+;

// Other backslashes that will not need escaping (odd number due to not matching the previous rule).
BACKSLASHES: '\\'+ -> type(ANY_STR);

// The dollar sign must be singled out so that we can recognize interpolations.
DOLLAR: '$' -> type(ANY_STR);

// The backslash and dollar characters must not be grouped with others, so that
// we can properly detect the tokens above.
SPECIAL_CHAR: [\\$];
ANY_STR: ~[\\$]+; // anything else

////////////////
// VALUE_MODE //
Expand Down Expand Up @@ -96,11 +105,17 @@ mode QUOTED_SINGLE_MODE;
QSINGLE_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);
MATCHING_QUOTE_CLOSE: '\'' -> popMode;

QSINGLE_ESC: ('\\\'' | ESC_BACKSLASH)+ -> type(ESC);
// Regular string: anything that does not contain any $ *or quote* and does not end with \
QSINGLE_STR: ~['$]* ~['\\$] -> type(ANY_STR);

QSINGLE_ESC_INTER: ESC_INTER -> type(ESC_INTER);

QSINGLE_SPECIAL_CHAR: SPECIAL_CHAR -> type(SPECIAL_CHAR);
QSINGLE_STR: ~['\\$]+ -> type(ANY_STR);
// Escaped quote (optionally preceded by an even number of backslashes).
QSINGLE_ESC_QUOTE: ESC_BACKSLASH* '\\\'' -> type(ESC);

QUOTED_ESC: ESC_BACKSLASH+;
QSINGLE_BACKSLASHES: '\\'+ -> type(ANY_STR);
QSINGLE_DOLLAR: '$' -> type(ANY_STR);


////////////////////////
Expand All @@ -114,8 +129,9 @@ mode QUOTED_DOUBLE_MODE;
QDOUBLE_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);
QDOUBLE_CLOSE: '"' -> type(MATCHING_QUOTE_CLOSE), popMode;

QDOUBLE_ESC: ('\\"' | ESC_BACKSLASH)+ -> type(ESC);
QDOUBLE_STR: ~["$]* ~["\\$] -> type(ANY_STR);
omry marked this conversation as resolved.
Show resolved Hide resolved
QDOUBLE_ESC_INTER: ESC_INTER -> type(ESC_INTER);

QDOUBLE_SPECIAL_CHAR: SPECIAL_CHAR -> type(SPECIAL_CHAR);
QDOUBLE_STR: ~["\\$]+ -> type(ANY_STR);
QDOUBLE_ESC_QUOTE: ESC_BACKSLASH* '\\"' -> type(ESC);
QDOUBLE_ESC: ESC_BACKSLASH+ -> type(QUOTED_ESC);
QDOUBLE_BACKSLASHES: '\\'+ -> type(ANY_STR);
QDOUBLE_DOLLAR: '$' -> type(ANY_STR);
2 changes: 1 addition & 1 deletion omegaconf/grammar/OmegaConfGrammarParser.g4
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ singleElement: element EOF;

// Composite text expression (may contain interpolations).

text: (interpolation | ESC | ESC_INTER | SPECIAL_CHAR | ANY_STR)+;
text: (interpolation | ANY_STR | ESC | ESC_INTER | TOP_ESC | QUOTED_ESC)+;
omry marked this conversation as resolved.
Show resolved Hide resolved


// Elements.
Expand Down
51 changes: 39 additions & 12 deletions omegaconf/grammar_visitor.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import sys
import warnings
from itertools import zip_longest
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Generator,
Iterable,
List,
Optional,
Set,
Expand Down Expand Up @@ -289,7 +289,7 @@ def visitSingleElement(
return self.visit(ctx.getChild(0))

def visitText(self, ctx: OmegaConfGrammarParser.TextContext) -> Any:
# (interpolation | ESC | ESC_INTER | SPECIAL_CHAR | ANY_STR)+
# (interpolation | ANY_STR | ESC | ESC_INTER | TOP_ESC | QUOTED_ESC)+

# Single interpolation? If yes, return its resolved value "as is".
if ctx.getChildCount() == 1:
Expand All @@ -298,7 +298,7 @@ def visitText(self, ctx: OmegaConfGrammarParser.TextContext) -> Any:
return self.visitInterpolation(c)

# Otherwise, concatenate string representations together.
return self._unescape(ctx.getChildren())
return self._unescape(list(ctx.getChildren()))

def _createPrimitive(
self,
Expand Down Expand Up @@ -336,11 +336,11 @@ def _createPrimitive(
raise AssertionError("WS should never be reached")
assert False, symbol.type
# Concatenation of multiple items ==> un-escape the concatenation.
return self._unescape(ctx.getChildren())
return self._unescape(list(ctx.getChildren()))

def _unescape(
self,
seq: Iterable[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]],
seq: List[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]],
) -> str:
"""
Concatenate all symbols / interpolations in `seq`, unescaping symbols as needed.
Expand All @@ -350,16 +350,43 @@ def _unescape(
resolving of the interpolation).
"""
chrs = []
for node in seq:
for node, next_node in zip_longest(seq, seq[1:]):
omry marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(node, TerminalNode):
s = node.symbol
if s.type == OmegaConfGrammarLexer.ESC:
chrs.append(s.text[1::2])
elif s.type == OmegaConfGrammarLexer.ESC_INTER:
chrs.append(s.text[1:])
if s.type == OmegaConfGrammarLexer.ESC_INTER:
# `ESC_INTER` is of the form `\\...\${`: the formula below computes
# the number of characters to keep at the end of the string to remove
# the correct number of backslashes.
text = s.text[-(len(s.text) // 2 + 1) :]
elif (
# Character sequence identified as requiring un-escaping.
s.type == OmegaConfGrammarLexer.ESC
or (
# At top level, we need to un-escape backslashes that precede
# an interpolation.
s.type == OmegaConfGrammarLexer.TOP_ESC
and isinstance(
next_node, OmegaConfGrammarParser.InterpolationContext
)
)
or (
# In a quoted sring, we need to un-escape backslashes that
# either end the string, or are followed by an interpolation.
s.type == OmegaConfGrammarLexer.QUOTED_ESC
and (
next_node is None
or isinstance(
next_node, OmegaConfGrammarParser.InterpolationContext
)
)
)
):
text = s.text[1::2] # un-escape the sequence
else:
chrs.append(s.text)
text = s.text # keep the original text
else:
assert isinstance(node, OmegaConfGrammarParser.InterpolationContext)
chrs.append(str(self.visitInterpolation(node)))
text = str(self.visitInterpolation(node))
chrs.append(text)

return "".join(chrs)
26 changes: 19 additions & 7 deletions tests/test_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,14 @@
("str_quoted_too_many_3", "''a''", GrammarParseError),
("str_quoted_trailing_esc_1", r"'abc\\'", r" abc\ ".strip()),
("str_quoted_trailing_esc_2", r"'abc\\\\'", r" abc\\ ".strip()),
("str_quoted_no_esc_1", r'"abc\def"', r"abc\def"),
("str_quoted_no_esc_2", r'"abc\\def"', r"abc\def"),
("str_quoted_no_esc_3", r'"\\\abc\def"', r"\\abc\def"),
("str_quoted_no_esc_single_1", r"'abc\def'", r"abc\def"),
("str_quoted_no_esc_single_2", r"'abc\\def'", r"abc\\def"),
("str_quoted_no_esc_single_3", r"'\\\abc\def'", r"\\\abc\def"),
("str_quoted_no_esc_dollar_single", r"'abc\\$$'", r"abc\\$$"),
("str_quoted_no_esc_double_1", r'"abc\def"', r"abc\def"),
("str_quoted_no_esc_double_2", r'"abc\\def"', r"abc\\def"),
("str_quoted_no_esc_double_3", r'"\\\abc\def"', r"\\\abc\def"),
("str_quoted_no_esc_dollar_double", r'"abc\\$$"', r"abc\\$$"),
("str_quoted_bad_1", r'"abc\"', GrammarParseError),
("str_quoted_bad_2", r'"abc\\\"', GrammarParseError),
("str_quoted_esc_quote_single_1", r"'abc\'def'", "abc'def"),
Expand Down Expand Up @@ -255,8 +260,12 @@
("str_quoted_inter", "'${null}'", "None"),
("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"),
("str_quoted_esc_single_2", r"""'\\\${foo}'""", r"\${foo}"),
("str_quoted_esc_single_3", r"""'\\a_${str}'""", r"\\a_hi"),
("str_quoted_esc_single_4", r"""'a_${str}\\'""", r" a_hi\ ".strip()),
("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'),
("str_quoted_esc_double_2", r'''"\\\${foo}"''', r"\${foo}"),
("str_quoted_esc_double_3", r'''"\\a_${str}"''', r"\\a_hi"),
("str_quoted_esc_double_4", r'''"a_${str}\\"''', r" a_hi\ ".strip()),
("str_quoted_other_quote_double", """'double"'""", 'double"'),
("str_quoted_other_quote_single", '''"single'"''', "single'"),
("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError),
Expand Down Expand Up @@ -357,13 +366,16 @@
("str_top_esc_inter", r"Esc: \${str}", "Esc: ${str}"),
("str_top_esc_inter_wrong_1", r"Wrong: $\{str\}", r"Wrong: $\{str\}"),
("str_top_esc_inter_wrong_2", r"Wrong: \${str\}", r"Wrong: ${str\}"),
("str_top_esc_backslash", r"Esc: \\${str}", r"Esc: \hi"),
("str_top_esc_backslash_1", r"Esc: \\${str}", r"Esc: \hi"),
("str_top_esc_backslash_2", r"Esc: \\\\${str}", r"Esc: \\hi"),
("str_top_quoted_braces_wrong", r"Wrong: \{${str}\}", r"Wrong: \{hi\}"),
("str_top_leading_dollars", r"$$${str}", "$$hi"),
("str_top_trailing_dollars", r"${str}$$$$", "hi$$$$"),
("str_top_leading_escapes", r"\\\\\${str}", r"\\${str}"),
("str_top_middle_escapes", r"abc\\\\\${str}", r"abc\\${str}"),
("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 3),
("str_top_leading_escapes_1", r"\\\\\${str}", r"\\${str}"),
("str_top_leading_escapes_2", r"\\\\ \${str}", r"\\\\ ${str}"),
("str_top_middle_escapes_1", r"abc\\\\\${str}", r"abc\\${str}"),
("str_top_middle_escapes_2", r"abc\\\\ \${str}", r"abc\\\\ ${str}"),
("str_top_trailing_escapes", r" ${str}\\\ ".strip(), r" hi\\\ ".strip()),
("str_top_concat_interpolations", "${null}${float}", "None1.2"),
("str_top_issue_617", r""" ${test: "hi\\" }"} """, r" hi\"} "),
# Whitespaces.
Expand Down