Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ON HOLD] Simpler escaping in the grammar, and fix to quoted values #621

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions omegaconf/grammar/OmegaConfGrammarLexer.g4
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ fragment ESC_BACKSLASH: '\\\\'; // escaped backslash

TOP_INTER_OPEN: INTER_OPEN -> type(INTER_OPEN), pushMode(INTERPOLATION_MODE);

ESC_INTER: '\\${';
TOP_ESC: ESC_BACKSLASH+ -> type(ESC);
ESC_INTER: ('\\\\')* '\\${'; // \${, \\\${, \\\\\${, ...
EVEN_BACKSLASHES: ESC_BACKSLASH+; // \\, \\\\, \\\\\\, ...

// The backslash and dollar characters must not be grouped with others, so that
// we can properly detect the tokens above.
Expand Down Expand Up @@ -60,9 +60,21 @@ ESC: (ESC_BACKSLASH | '\\(' | '\\)' | '\\[' | '\\]' | '\\{' | '\\}' |
'\\:' | '\\=' | '\\,' | '\\ ' | '\\\t')+;
WS: [ \t]+;

// Quoted values for both types of quotes.
// A quoted value is made of the enclosing quotes, and either:
// - nothing else
// - an even number of backslashes (meaning they are escaped)
// - a sequence of any character, followed by any non-backslash character, and optionally
// an even number of backslashes (i.e., also escaped)
// Examples (right hand side: expected content of the resulting string, after processing):
// "" -> <empty>
// '\\' -> \
// "\\\\" -> \\
// 'abc\\' -> abc\
// "abc\"def\\\'ghi\\\\" -> abc"def\\\'ghi\\
QUOTED_VALUE:
'\'' ('\\\''|.)*? '\'' // Single quotes, can contain escaped single quote : /'
| '"' ('\\"'|.)*? '"' ; // Double quotes, can contain escaped double quote : /"
'"' (('\\\\')* | (.)*? ~[\\] ('\\\\')*) '"' // double quotes
| '\'' (('\\\\')* | (.)*? ~[\\] ('\\\\')*) '\''; // single quotes

////////////////////////
// INTERPOLATION_MODE //
Expand Down
2 changes: 1 addition & 1 deletion omegaconf/grammar/OmegaConfGrammarParser.g4
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ configValue: (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+)) EOF;
singleElement: element EOF;

// Top-level string (that does not need to be parsed).
toplevelStr: (ESC | ESC_INTER | TOP_CHAR | TOP_STR)+;
toplevelStr: (EVEN_BACKSLASHES | ESC_INTER | TOP_CHAR | TOP_STR)+;

// Elements.

Expand Down
136 changes: 114 additions & 22 deletions omegaconf/grammar_visitor.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import sys
import warnings
from typing import (
Expand Down Expand Up @@ -34,6 +35,16 @@
)
sys.exit(1)

# Regex matching trailing backslashes.
TRAILING_BACKSLASHES = re.compile("(\\\\)+$")

# Regex matching escaped quotes, for each kind of quote.
# Note that we include *all* backslashes preceding the quote.
ESCAPED_QUOTE = {
"'": re.compile(r"(\\)+'"), # escaped single quote
'"': re.compile(r'(\\)+"'), # escaped double quote
}


class GrammarVisitor(OmegaConfGrammarParserVisitor):
def __init__(
Expand Down Expand Up @@ -100,14 +111,33 @@ def visitConfigValue(self, ctx: OmegaConfGrammarParser.ConfigValueContext) -> An
# (toplevelStr | (toplevelStr? (interpolation toplevelStr?)+)) EOF
# Visit all children (except last one which is EOF)
vals = [self.visit(c) for c in list(ctx.getChildren())[:-1]]
assert vals
if len(vals) == 1 and isinstance(
n_vals = len(vals)
assert n_vals > 0

if n_vals == 1 and isinstance(
ctx.getChild(0), OmegaConfGrammarParser.InterpolationContext
):
# Single interpolation: return the result "as is".
return vals[0]

# Concatenation of multiple components.
return "".join(map(str, vals))
# When a top-level string is followed by an interpolation, we need to un-escape
# any trailing backslash.
tokens = []
for i, (child, val) in enumerate(zip(ctx.getChildren(), vals)):
if isinstance(child, OmegaConfGrammarParser.ToplevelStrContext):
if i < n_vals - 1:
# Top-level string followed by an interpolation.
assert isinstance(
ctx.getChild(i + 1), OmegaConfGrammarParser.InterpolationContext
)
tokens.append(self._unescape_trailing_backslashes(val))
else:
tokens.append(val)
else:
tokens.append(str(val))

return "".join(tokens)

def visitDictKey(self, ctx: OmegaConfGrammarParser.DictKeyContext) -> Any:
return self._createPrimitive(ctx)
Expand Down Expand Up @@ -282,8 +312,19 @@ def visitSingleElement(
return self.visit(ctx.getChild(0))

def visitToplevelStr(self, ctx: OmegaConfGrammarParser.ToplevelStrContext) -> str:
# (ESC | ESC_INTER | TOP_CHAR | TOP_STR)+
return self._unescape(ctx.getChildren())
# (EVEN_BACKSLASHES | ESC_INTER | TOP_CHAR | TOP_STR)+

# Concatenate all fragments, un-escaping interpolations.
tokens = []
for child in ctx.getChildren():
txt = child.getText()
if child.symbol.type == OmegaConfGrammarLexer.ESC_INTER:
# Un-escape the interpolation, e.g. \${ -> ${ or \\\${ -> \${
assert len(txt) % 2 == 1
txt = txt[-(len(txt) + 1) // 2 :]
tokens.append(txt)

return "".join(tokens)

def _createPrimitive(
self,
Expand Down Expand Up @@ -318,13 +359,13 @@ def _createPrimitive(
elif symbol.type == OmegaConfGrammarLexer.BOOL:
return symbol.text.lower() == "true"
elif symbol.type == OmegaConfGrammarLexer.ESC:
return self._unescape([child])
return self._unescape_from_sequence([child])
elif symbol.type == OmegaConfGrammarLexer.WS: # pragma: no cover
# A single WS should have been "consumed" by another token.
raise AssertionError("WS should never be reached")
assert False, symbol.type
# Concatenation of multiple items ==> un-escape the concatenation.
return self._unescape(ctx.getChildren())
return self._unescape_from_sequence(ctx.getChildren())

def _resolve_quoted_string(self, quoted: str) -> str:
"""
Expand All @@ -333,23 +374,18 @@ def _resolve_quoted_string(self, quoted: str) -> str:
# Identify quote type.
assert len(quoted) >= 2 and quoted[0] == quoted[-1]
quote_type = quoted[0]
assert quote_type in ["'", '"']

# Un-escape quotes and backslashes within the string (the two kinds of
# escapable characters in quoted strings). We do it in two passes:
# 1. Replace `\"` with `"` (and similarly for single quotes)
# 2. Replace `\\` with `\`
# The order is important so that `\\"` is replaced with an escaped quote `\"`.
# We also remove the start and end quotes.
esc_quote = f"\\{quote_type}"
quoted_content = (
quoted[1:-1].replace(esc_quote, quote_type).replace("\\\\", "\\")
)

# Remove enclosing quotes.
content = quoted[1:-1]

# Un-escape quotes then trailing backslashes.
content = self._unescape_quotes(content, quote_type)
content = self._unescape_trailing_backslashes(content)

# Parse the string.
return self.quoted_string_callback(quoted_content)
return self.quoted_string_callback(content)

def _unescape(
def _unescape_from_sequence(
self,
seq: Iterable[Union[TerminalNode, OmegaConfGrammarParser.InterpolationContext]],
) -> str:
Expand All @@ -367,10 +403,66 @@ def _unescape(
if s.type == OmegaConfGrammarLexer.ESC:
chrs.append(s.text[1::2])
elif s.type == OmegaConfGrammarLexer.ESC_INTER:
chrs.append(s.text[1:])
assert False # escaped interpolations are handled elsewhere
else:
chrs.append(s.text)
else:
assert isinstance(node, OmegaConfGrammarParser.InterpolationContext)
chrs.append(str(self.visitInterpolation(node)))
return "".join(chrs)

def _unescape_quotes(self, expr: str, quote_type: str) -> str:
"""
Un-escape escaped quotes from an expression.

Examples:
abc -> abc
abc\'def -> abc'def
abc\'\'def\\\'gh -> abc''def\'gh
"""
pattern = ESCAPED_QUOTE[quote_type]
match = pattern.search(expr)

if match is None:
return expr

tokens = []
while match is not None:
start, stop = match.span()
size = stop - start
# An escaped quote is made of an odd number of backslashes followed by a
# quote, so the total number of matching characters should be even.
assert size % 2 == 0
# Add characters before the escaped quote.
tokens.append(expr[0:start])
# Add the escaped quote (un-escaping the backslashes, which can be achieved
# by extracting the proper subset of the string).
tokens.append(expr[stop - size // 2 : stop])
# Move on to next match.
expr = expr[stop:]
match = pattern.search(expr)

# Add characters after the last match.
tokens.append(expr)

return "".join(tokens)

def _unescape_trailing_backslashes(self, expr: str) -> str:
"""
Un-escape trailing backslashes from `expr`.

Examples:
abc -> abc
abc\\ -> abc\
abc\\def -> abc\\def
abc\\def\\\\ -> abc\\def\\
"""
match = TRAILING_BACKSLASHES.search(expr)
if match is None:
return expr
start, end = match.span()
n_backslashes = end - start
# Sanity check: there should be an even number of backslashes at end of string.
assert n_backslashes % 2 == 0
# Un-escaping backslashes <=> removing half of them.
return expr[0 : start + n_backslashes // 2]
34 changes: 27 additions & 7 deletions tests/test_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,23 @@
("str_quoted_too_many_1", "''a'", GrammarParseError),
("str_quoted_too_many_2", "'a''", GrammarParseError),
("str_quoted_too_many_3", "''a''", GrammarParseError),
("str_quoted_trailing_esc_1", "'abc\\\\'", "abc\\"),
("str_quoted_trailing_esc_2", "'abc\\\\\\\\'", "abc\\\\"),
("str_quoted_no_esc_1", '"abc\\def"', "abc\\def"),
("str_quoted_no_esc_2", '"abc\\\\def"', "abc\\\\def"),
("str_quoted_no_esc_3", '"\\\\\\abc\\def"', "\\\\\\abc\\def"),
("str_quoted_bad_1", '"abc\\"', GrammarParseError),
("str_quoted_bad_2", '"abc\\\\\\"', GrammarParseError),
("str_quoted_esc_quote_single_1", "'abc\\'def'", "abc'def"),
("str_quoted_esc_quote_single_2", "'abc\\\\\\'def'", "abc\\'def"),
("str_quoted_esc_quote_single_3", "'abc\\\\\\\\\\'def'", "abc\\\\'def"),
("str_quoted_esc_quote_single_4", "'a\\'b\\'cdef\\\\\\''", "a'b'cdef\\'"),
("str_quoted_esc_quote_single_bad", "'abc\\\\'def'", GrammarParseError),
("str_quoted_esc_quote_double_1", '"abc\\"def"', 'abc"def'),
("str_quoted_esc_quote_double_2", '"abc\\\\\\"def"', 'abc\\"def'),
("str_quoted_esc_quote_double_3", '"abc\\\\\\\\\\"def"', 'abc\\\\"def'),
("str_quoted_esc_quote_double_4", '"a\\"b\\"cdef\\\\\\""', 'a"b"cdef\\"'),
("str_quoted_esc_quote_double_bad", '"abc\\\\"def"', GrammarParseError),
# Lists and dictionaries.
("list", "[0, 1]", [0, 1]),
(
Expand Down Expand Up @@ -218,9 +235,9 @@
# Interpolations in quoted strings.
("str_quoted_inter", "'${null}'", "None"),
("str_quoted_esc_single_1", r"'ab\'cd\'\'${str}'", "ab'cd''hi"),
("str_quoted_esc_single_2", "'\"\\\\\\\\\\${foo}\\ '", r'"\${foo}\ '),
("str_quoted_esc_single_2", "'\"\\\\\\${foo}\\ '", r'"\${foo}\ '),
("str_quoted_esc_double_1", r'"ab\"cd\"\"${str}"', 'ab"cd""hi'),
("str_quoted_esc_double_2", '"\'\\\\\\\\\\${foo}\\ "', r"'\${foo}\ "),
("str_quoted_esc_double_2", '"\'\\\\\\${foo}\\ "', r"'\${foo}\ "),
("str_quoted_concat_bad_1", '"Hi "${str}', GrammarParseError),
# Whitespaces.
("ws_inter_node_outer", "${ \tdict.a \t}", 0),
Expand All @@ -240,7 +257,7 @@
("nested_simple", "${${ref_str}}", "hi"),
("nested_select", "${options.${choice}}", "A"),
("nested_relative", "${${rel_opt}.b}", "B"),
("str_quoted_nested", r"'AB${test:\'CD${test:\\'EF\\'}GH\'}'", "ABCDEFGH"),
("str_quoted_nested", r"'AB${test:\'CD${test:\\\'EF\\\'}GH\'}'", "ABCDEFGH"),
# Resolver interpolations.
("no_args", "${test:}", []),
("space_in_args", "${test:a, b c}", ["a", "b c"]),
Expand Down Expand Up @@ -272,8 +289,7 @@
("dict_nan_key_2", "${first:{${test:nan}: 0}}", GrammarParseError),
]

# Parameters for tests of the "configValue" rule (may contain node
# interpolations, but no resolvers).
# Parameters for tests of the "configValue" rule (may contain interpolations).
PARAMS_CONFIG_VALUE = [
# String interpolations (top-level).
("str_top_basic", "bonjour ${str}", "bonjour hi"),
Expand All @@ -298,7 +314,9 @@
("str_top_middle_quotes_single", "I like '${str}'", "I like 'hi'"),
("str_top_middle_quotes_double", 'I like "${str}"', 'I like "hi"'),
("str_top_any_char", "${str} !@\\#$%^&*})][({,/?;", "hi !@\\#$%^&*})][({,/?;"),
("str_top_esc_inter", r"Esc: \${str}", "Esc: ${str}"),
("str_top_esc_inter_1", r"Esc: \${str}", "Esc: ${str}"),
("str_top_esc_inter_2", r"Esc: \\\${str}", r"Esc: \${str}"),
("str_top_esc_inter_3", r"abc\\${str}\${str}\\", r"abc\hi${str}\\"),
("str_top_esc_inter_wrong_1", r"Wrong: $\{str\}", r"Wrong: $\{str\}"),
("str_top_esc_inter_wrong_2", r"Wrong: \${str\}", r"Wrong: ${str\}"),
("str_top_esc_backslash", r"Esc: \\${str}", r"Esc: \hi"),
Expand All @@ -307,8 +325,9 @@
("str_top_trailing_dollars", r"${str}$$$$", "hi$$$$"),
("str_top_leading_escapes", r"\\\\\${str}", r"\\${str}"),
("str_top_middle_escapes", r"abc\\\\\${str}", r"abc\\${str}"),
("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 3),
("str_top_trailing_escapes", "${str}" + "\\" * 5, "hi" + "\\" * 5),
("str_top_concat_interpolations", "${null}${float}", "None1.2"),
("str_top_issue_617", r""" ${test: "hi\\" }"} """, ' hi\\"} '),
# Whitespaces.
("ws_toplevel", " \tab ${str} cd ${int}\t", " \tab hi cd 123\t"),
# Unmatched braces.
Expand Down Expand Up @@ -376,6 +395,7 @@ def test_config_value(
self, restore_resolvers: Any, definition: str, expected: Any
) -> None:
parse_tree, expected_visit = self._parse("configValue", definition, expected)
OmegaConf.register_new_resolver("test", self._resolver_test)
self._visit_with_config(parse_tree, expected_visit)

@parametrize_from(
Expand Down