Skip to content

Commit

Permalink
Simpler escaping in the grammar, and fix to quoted values
Browse files Browse the repository at this point in the history
* In top-level strings, the only backslashes that can and should be
  escaped are those preceding an interpolation. For instance, if "x"
  is a node evaluating to "X":
        \${x}   -> ${x}
        \\${x}  -> \X
        \\\${x} -> \${x}
        ${x}\\  -> X\\

* In quoted strings, the only backslashes that can and should be escaped
  are those preceding a quote of the same type as the enclosing quotes.
  For instance:
        "abc\"def" -> abc"def
        "abc\\" -> abc\
        "abc\\\"def" -> abc\"def
        "abc\\\'def" -> abc\\\'def

This also fixes a bug with the parsing of quoted strings (see omry#617).

Fixes omry#615
Fixes omry#617
  • Loading branch information
odelalleau committed Mar 18, 2021
1 parent 074e8dc commit 37f0a5f
Show file tree
Hide file tree
Showing 4 changed files with 157 additions and 33 deletions.
20 changes: 16 additions & 4 deletions omegaconf/grammar/OmegaConfGrammarLexer.g4
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ fragment ESC_BACKSLASH: '\\\\'; // escaped backslash

TOP_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);

ESC_INTER: '\\${';
TOP_ESC: ESC_BACKSLASH+ -> type(ESC);
ESC_INTER: ('\\\\')* '\\${'; // \${, \\\${, \\\\\${, ...
EVEN_BACKSLASHES: ESC_BACKSLASH+; // \\, \\\\, \\\\\\, ...

// The backslash and dollar characters must not be grouped with others, so that
// we can properly detect the tokens above.
Expand Down Expand Up @@ -60,9 +60,21 @@ ESC: (ESC_BACKSLASH | '\\(' | '\\)' | '\\[' | '\\]' | '\\{' | '\\}' |
'\\:' | '\\=' | '\\,' | '\\ ' | '\\\t')+;
WS: [ \t]+;

// Quoted values for both types of quotes.
// A quoted value is made of the enclosing quotes, and either:
// - nothing else
// - an even number of backslashes (meaning they are escaped)
// - a sequence of any character, followed by any non-backslash character, and optionally
// an even number of backslashes (i.e., also escaped)
// Examples (right hand side: expected content of the resulting string, after processing):
// "" -> <empty>
// '\\' -> \
// "\\\\" -> \\
// 'abc\\' -> abc\
// "abc\"def\\\'ghi\\\\" -> abc"def\\\'ghi\\
QUOTED_VALUE:
'\'' ('\\\''|.)*? '\'' // Single quotes, can contain escaped single quote : /'
| '"' ('\\"'|.)*? '"' ; // Double quotes, can contain escaped double quote : /"
'"' (('\\\\')* | (.)*? ~[\\] ('\\\\')*) '"' // double quotes
| '\'' (('\\\\')* | (.)*? ~[\\] ('\\\\')*) '\''; // single quotes

////////////////////////
// INTERPOLATION_MODE //
Expand Down
2 changes: 1 addition & 1 deletion omegaconf/grammar/OmegaConfGrammarParser.g4
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ configValue: (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+)) EOF;
singleElement: element EOF;

// Top-level string (that does not need to be parsed).
toplevelStr: (ESC | ESC_INTER | TOP_CHAR | TOP_STR)+;
toplevelStr: (EVEN_BACKSLASHES | ESC_INTER | TOP_CHAR | TOP_STR)+;

// Elements.

Expand Down
134 changes: 113 additions & 21 deletions omegaconf/grammar_visitor.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import sys
import warnings
from typing import (
Expand Down Expand Up @@ -34,6 +35,16 @@
)
sys.exit(1)

# Regex matching trailing backslashes.
TRAILING_BACKSLASHES = re.compile("(\\\\)+$")

# Regex matching escaped quotes, for each kind of quote.
# Note that we include *all* backslashes preceding the quote.
ESCAPED_QUOTE = {
"'": re.compile(r"(\\)+'"), # escaped single quote
'"': re.compile(r'(\\)+"'), # escaped double quote
}


class GrammarVisitor(OmegaConfGrammarParserVisitor):
def __init__(
Expand Down Expand Up @@ -100,14 +111,33 @@ def visitConfigValue(self, ctx: OmegaConfGrammarParser.ConfigValueContext) -> An
# (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+)) EOF
# Visit all children (except last one which is EOF)
vals = [self.visit(c) for c in list(ctx.getChildren())[:-1]]
assert vals
if len(vals) == 1 and isinstance(
n_vals = len(vals)
assert n_vals > 0

if n_vals == 1 and isinstance(
ctx.getChild(0), OmegaConfGrammarParser.InterpolationContext
):
# Single interpolation: return the result "as is".
return vals[0]

# Concatenation of multiple components.
return "".join(map(str, vals))
# When a top-level string is followed by an interpolation, we need to un-escape
# any trailing backslash.
tokens = []
for i, (child, val) in enumerate(zip(ctx.getChildren(), vals)):
if isinstance(child, OmegaConfGrammarParser.ToplevelStrContext):
if i < n_vals - 1:
# Top-level string followed by an interpolation.
assert isinstance(
ctx.getChild(i + 1), OmegaConfGrammarParser.InterpolationContext
)
tokens.append(self._unescape_trailing_backslashes(val))
else:
tokens.append(val)
else:
tokens.append(str(val))

return "".join(tokens)

def visitDictKey(self, ctx: OmegaConfGrammarParser.DictKeyContext) -> Any:
return self._createPrimitive(ctx)
Expand Down Expand Up @@ -282,8 +312,19 @@ def visitSingleElement(
return self.visit(ctx.getChild(0))

def visitToplevelStr(self, ctx: OmegaConfGrammarParser.ToplevelStrContext) -> str:
# (ESC | ESC_INTER | TOP_CHAR | TOP_STR)+
return self._unescape(ctx.getChildren())
# (EVEN_BACKSLASHES | ESC_INTER | TOP_CHAR | TOP_STR)+

# Concatenate all fragments, un-escaping interpolations.
tokens = []
for child in ctx.getChildren():
txt = child.getText()
if child.symbol.type == OmegaConfGrammarLexer.ESC_INTER:
# Un-escape the interpolation, e.g. \${ -> ${ or \\\${ -> \${
assert len(txt) % 2 == 1
txt = txt[-(len(txt) + 1) // 2 :]
tokens.append(txt)

return "".join(tokens)

def _createPrimitive(
self,
Expand Down Expand Up @@ -318,13 +359,13 @@ def _createPrimitive(
elif symbol.type == OmegaConfGrammarLexer.BOOL:
return symbol.text.lower() == "true"
elif symbol.type == OmegaConfGrammarLexer.ESC:
return self._unescape([child])
return self._unescape_from_sequence([child])
elif symbol.type == OmegaConfGrammarLexer.WS: # pragma: no cover
# A single WS should have been "consumed" by another token.
raise AssertionError("WS should never be reached")
assert False, symbol.type
# Concatenation of multiple items ==> un-escape the concatenation.
return self._unescape(ctx.getChildren())
return self._unescape_from_sequence(ctx.getChildren())

def _resolve_quoted_string(self, quoted: str) -> str:
"""
Expand All @@ -333,23 +374,18 @@ def _resolve_quoted_string(self, quoted: str) -> str:
# Identify quote type.
assert len(quoted) >= 2 and quoted[0] == quoted[-1]
quote_type = quoted[0]
assert quote_type in ["'", '"']

# Un-escape quotes and backslashes within the string (the two kinds of
# escapable characters in quoted strings). We do it in two passes:
# 1. Replace `\"` with `"` (and similarly for single quotes)
# 2. Replace `\\` with `\`
# The order is important so that `\\"` is replaced with an escaped quote `\"`.
# We also remove the start and end quotes.
esc_quote = f"\\{quote_type}"
quoted_content = (
quoted[1:-1].replace(esc_quote, quote_type).replace("\\\\", "\\")
)

# Remove enclosing quotes.
content = quoted[1:-1]

# Un-escape quotes then trailing backslashes.
content = self._unescape_quotes(content, quote_type)
content = self._unescape_trailing_backslashes(content)

# Parse the string.
return self.quoted_string_callback(quoted_content)
return self.quoted_string_callback(content)

def _unescape(
def _unescape_from_sequence(
self,
seq: Iterable[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]],
) -> str:
Expand All @@ -374,3 +410,59 @@ def _unescape(
assert isinstance(node, OmegaConfGrammarParser.InterpolationContext)
chrs.append(str(self.visitInterpolation(node)))
return "".join(chrs)

def _unescape_quotes(self, expr: str, quote_type: str) -> str:
"""
Un-escape escaped quotes from an expression.
Examples:
abc -> abc
abc\'def -> abc'def
abc\'\'def\\\'gh -> abc''def\'gh
"""
pattern = ESCAPED_QUOTE[quote_type]
match = pattern.search(expr)

if match is None:
return expr

tokens = []
while match is not None:
start, stop = match.span()
size = stop - start
# An escaped quote is made of an odd number of backslashes followed by a
# quote, so the total number of matching characters should be even.
assert size % 2 == 0
# Add characters before the escaped quote.
tokens.append(expr[0:start])
# Add the escaped quote (un-escaping the backslashes, which can be achieved
# by extracting the proper subset of the string).
tokens.append(expr[stop - size // 2 : stop])
# Move on to next match.
expr = expr[stop:]
match = pattern.search(expr)

# Add characters after the last match.
tokens.append(expr)

return "".join(tokens)

def _unescape_trailing_backslashes(self, expr: str) -> str:
"""
Un-escape trailing backslashes from `expr`.
Examples:
abc -> abc
abc\\ -> abc\
abc\\def -> abc\\def
abc\\def\\\\ -> abc\\def\\
"""
match = TRAILING_BACKSLASHES.search(expr)
if match is None:
return expr
start, end = match.span()
n_backslashes = end - start
# Sanity check: there should be an even number of backslashes at end of string.
assert n_backslashes % 2 == 0
# Un-escaping backslashes <=> removing half of them.
return expr[0 : start + n_backslashes // 2]
34 changes: 27 additions & 7 deletions tests/test_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,23 @@
("str_quoted_too_many_1", "''a'", GrammarParseError),
("str_quoted_too_many_2", "'a''", GrammarParseError),
("str_quoted_too_many_3", "''a''", GrammarParseError),
("str_quoted_trailing_esc_1", "'abc\\\\'", "abc\\"),
("str_quoted_trailing_esc_2", "'abc\\\\\\\\'", "abc\\\\"),
("str_quoted_no_esc_1", '"abc\\def"', "abc\\def"),
("str_quoted_no_esc_2", '"abc\\\\def"', "abc\\\\def"),
("str_quoted_no_esc_3", '"\\\\\\abc\\def"', "\\\\\\abc\\def"),
("str_quoted_bad_1", '"abc\\"', GrammarParseError),
("str_quoted_bad_2", '"abc\\\\\\"', GrammarParseError),
("str_quoted_esc_quote_single_1", "'abc\\'def'", "abc'def"),
("str_quoted_esc_quote_single_2", "'abc\\\\\\'def'", "abc\\'def"),
("str_quoted_esc_quote_single_3", "'abc\\\\\\\\\\'def'", "abc\\\\'def"),
("str_quoted_esc_quote_single_4", "'a\\'b\\'cdef\\\\\\''", "a'b'cdef\\'"),
("str_quoted_esc_quote_single_bad", "'abc\\\\'def'", GrammarParseError),
("str_quoted_esc_quote_double_1", '"abc\\"def"', 'abc"def'),
("str_quoted_esc_quote_double_2", '"abc\\\\\\"def"', 'abc\\"def'),
("str_quoted_esc_quote_double_3", '"abc\\\\\\\\\\"def"', 'abc\\\\"def'),
("str_quoted_esc_quote_double_4", '"a\\"b\\"cdef\\\\\\""', 'a"b"cdef\\"'),
("str_quoted_esc_quote_double_bad", '"abc\\\\"def"', GrammarParseError),
# Lists and dictionaries.
("list", "[0, 1]", [0, 1]),
(
Expand Down Expand Up @@ -218,9 +235,9 @@
# Interpolations in quoted strings.
("str_quoted_inter", "'${null}'", "None"),
("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"),
("str_quoted_esc_single_2", "'\"\\\\\\\\\\${foo}\\ '", r'"\${foo}\ '),
("str_quoted_esc_single_2", "'\"\\\\\\${foo}\\ '", r'"\${foo}\ '),
("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'),
("str_quoted_esc_double_2", '"\'\\\\\\\\\\${foo}\\ "', r"'\${foo}\ "),
("str_quoted_esc_double_2", '"\'\\\\\\${foo}\\ "', r"'\${foo}\ "),
("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError),
# Whitespaces.
("ws_inter_node_outer", "${ \tdict.a \t}", 0),
Expand All @@ -240,7 +257,7 @@
("nested_simple", "${${ref_str}}", "hi"),
("nested_select", "${options.${choice}}", "A"),
("nested_relative", "${${rel_opt}.b}", "B"),
("str_quoted_nested", r"'AB${test:\'CD${test:\\'EF\\'}GH\'}'", "ABCDEFGH"),
("str_quoted_nested", r"'AB${test:\'CD${test:\\\'EF\\\'}GH\'}'", "ABCDEFGH"),
# Resolver interpolations.
("no_args", "${test:}", []),
("space_in_args", "${test:a, b c}", ["a", "b c"]),
Expand Down Expand Up @@ -272,8 +289,7 @@
("dict_nan_key_2", "${first:{${test:nan}: 0}}", GrammarParseError),
]

# Parameters for tests of the "configValue" rule (may contain node
# interpolations, but no resolvers).
# Parameters for tests of the "configValue" rule (may contain interpolations).
PARAMS_CONFIG_VALUE = [
# String interpolations (top-level).
("str_top_basic", "bonjour ${str}", "bonjour hi"),
Expand All @@ -298,7 +314,9 @@
("str_top_middle_quotes_single", "I like '${str}'", "I like 'hi'"),
("str_top_middle_quotes_double", 'I like "${str}"', 'I like "hi"'),
("str_top_any_char", "${str} !@\\#$%^&*})][({,/?;", "hi !@\\#$%^&*})][({,/?;"),
("str_top_esc_inter", r"Esc: \${str}", "Esc: ${str}"),
("str_top_esc_inter_1", r"Esc: \${str}", "Esc: ${str}"),
("str_top_esc_inter_2", r"Esc: \\\${str}", r"Esc: \${str}"),
("str_top_esc_inter_3", r"abc\\${str}\${str}\\", r"abc\hi${str}\\"),
("str_top_esc_inter_wrong_1", r"Wrong: $\{str\}", r"Wrong: $\{str\}"),
("str_top_esc_inter_wrong_2", r"Wrong: \${str\}", r"Wrong: ${str\}"),
("str_top_esc_backslash", r"Esc: \\${str}", r"Esc: \hi"),
Expand All @@ -307,8 +325,9 @@
("str_top_trailing_dollars", r"${str}$$$$", "hi$$$$"),
("str_top_leading_escapes", r"\\\\\${str}", r"\\${str}"),
("str_top_middle_escapes", r"abc\\\\\${str}", r"abc\\${str}"),
("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 3),
("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 5),
("str_top_concat_interpolations", "${null}${float}", "None1.2"),
("str_top_issue_617", r""" ${test: "hi\\" }"} """, ' hi\\"} '),
# Whitespaces.
("ws_toplevel", " \tab ${str} cd ${int}\t", " \tab hi cd 123\t"),
# Unmatched braces.
Expand Down Expand Up @@ -376,6 +395,7 @@ def test_config_value(
self, restore_resolvers: Any, definition: str, expected: Any
) -> None:
parse_tree, expected_visit = self._parse("configValue", definition, expected)
OmegaConf.register_new_resolver("test", self._resolver_test)
self._visit_with_config(parse_tree, expected_visit)

@parametrize_from(
Expand Down

0 comments on commit 37f0a5f

Please sign in to comment.