From 84504b093769de6d3713438a12264a4362bd43c6 Mon Sep 17 00:00:00 2001 From: 97littleleaf11 <97littleleaf11@users.noreply.github.com> Date: Thu, 29 Jul 2021 18:32:21 +0800 Subject: [PATCH] [mypyc] Use mypy.FORMAT_RE and ConversionSpecifier for % interpolation (#10877) mypy.checkstrformat offers regex and ConversionSpecifier for tokenizer, thus this PR: * deletes the redundant code * uses ConversionSpecifier as FormatOp --- mypy/checkstrformat.py | 49 ++++++++++++++++----------- mypyc/irbuild/expression.py | 14 ++++---- mypyc/irbuild/format_str_tokenizer.py | 35 +++++++------------ mypyc/test/test_stringformatting.py | 33 +++++++++++++----- 4 files changed, 74 insertions(+), 57 deletions(-) diff --git a/mypy/checkstrformat.py b/mypy/checkstrformat.py index adf1ba8c071f..302f077b5bd9 100644 --- a/mypy/checkstrformat.py +++ b/mypy/checkstrformat.py @@ -13,7 +13,7 @@ import re from typing import ( - cast, List, Tuple, Dict, Callable, Union, Optional, Pattern, Match, Set, Any + cast, List, Tuple, Dict, Callable, Union, Optional, Pattern, Match, Set ) from typing_extensions import Final, TYPE_CHECKING @@ -50,14 +50,14 @@ def compile_format_re() -> Pattern[str]: See https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting The regexp is intentionally a bit wider to report better errors. """ - key_re = r'(\(([^()]*)\))?' # (optional) parenthesised sequence of characters. - flags_re = r'([#0\-+ ]*)' # (optional) sequence of flags. - width_re = r'(\*|[1-9][0-9]*)?' # (optional) minimum field width (* or numbers). - precision_re = r'(?:\.(\*|[0-9]+)?)?' # (optional) . followed by * of numbers. + key_re = r'(\((?P<key>[^)]*)\))?' # (optional) parenthesised sequence of characters. + flags_re = r'(?P<flags>[#0\-+ ]*)' # (optional) sequence of flags. + width_re = r'(?P<width>[1-9][0-9]*|\*)?' # (optional) minimum field width (* or numbers). + precision_re = r'(?:\.(?P<precision>\*|[0-9]+)?)?' # (optional) . followed by * of numbers. length_mod_re = r'[hlL]?'
# (optional) length modifier (unused). - type_re = r'(.)?' # conversion type. + type_re = r'(?P<type>.)?' # conversion type. format_re = '%' + key_re + flags_re + width_re + precision_re + length_mod_re + type_re - return re.compile(format_re) + return re.compile('({})'.format(format_re)) def compile_new_format_re(custom_spec: bool) -> Pattern[str]: @@ -114,16 +114,20 @@ def compile_new_format_re(custom_spec: bool) -> Pattern[str]: class ConversionSpecifier: - def __init__(self, key: Optional[str], - flags: str, width: str, precision: str, type: str, + def __init__(self, type: str, + key: Optional[str], + flags: Optional[str], + width: Optional[str], + precision: Optional[str], format_spec: Optional[str] = None, conversion: Optional[str] = None, - field: Optional[str] = None) -> None: + field: Optional[str] = None, + whole_seq: Optional[str] = None) -> None: + self.type = type self.key = key self.flags = flags self.width = width self.precision = precision - self.type = type # Used only for str.format() calls (it may be custom for types with __format__()). self.format_spec = format_spec self.non_standard_format_spec = False @@ -132,24 +136,27 @@ def __init__(self, key: Optional[str], # Full formatted expression (i.e. key plus following attributes and/or indexes). # Used only for str.format() calls. self.field = field + self.whole_seq = whole_seq @classmethod - def from_match(cls, match_obj: Match[str], + def from_match(cls, match: Match[str], non_standard_spec: bool = False) -> 'ConversionSpecifier': """Construct specifier from match object resulted from parsing str.format() call.""" - match = cast(Any, match_obj) # TODO: remove this once typeshed is fixed.
if non_standard_spec: - spec = cls(match.group('key'), - flags='', width='', precision='', type='', + spec = cls(type='', + key=match.group('key'), + flags='', width='', precision='', format_spec=match.group('format_spec'), conversion=match.group('conversion'), field=match.group('field')) spec.non_standard_format_spec = True return spec # Replace unmatched optional groups with empty matches (for convenience). - return cls(match.group('key'), - flags=match.group('flags') or '', width=match.group('width') or '', - precision=match.group('precision') or '', type=match.group('type') or '', + return cls(type=match.group('type') or '', + key=match.group('key'), + flags=match.group('flags') or '', + width=match.group('width') or '', + precision=match.group('precision') or '', format_spec=match.group('format_spec'), conversion=match.group('conversion'), field=match.group('field')) @@ -622,10 +629,12 @@ def check_str_interpolation(self, def parse_conversion_specifiers(self, format: str) -> List[ConversionSpecifier]: specifiers: List[ConversionSpecifier] = [] - for parens_key, key, flags, width, precision, type in FORMAT_RE.findall(format): + for whole_seq, parens_key, key, flags, width, precision, type \ + in FORMAT_RE.findall(format): if parens_key == '': key = None - specifiers.append(ConversionSpecifier(key, flags, width, precision, type)) + specifiers.append(ConversionSpecifier(type, key, flags, width, precision, + whole_seq=whole_seq)) return specifiers def analyze_conversion_specifiers(self, specifiers: List[ConversionSpecifier], diff --git a/mypyc/irbuild/expression.py b/mypyc/irbuild/expression.py index e8833830e585..4833a728eb4e 100644 --- a/mypyc/irbuild/expression.py +++ b/mypyc/irbuild/expression.py @@ -569,7 +569,8 @@ def transform_basic_comparison(builder: IRBuilder, def translate_str_format_percent_sign(builder: IRBuilder, format_expr: StrExpr, rhs: Expression) -> Value: - literals, conversion_types = tokenizer_printf_style(format_expr.value) + literals, 
conversion_specifiers = tokenizer_printf_style(format_expr.value) + variables = [] if isinstance(rhs, TupleExpr): raw_variables = rhs.items @@ -578,15 +579,16 @@ def translate_str_format_percent_sign(builder: IRBuilder, else: raw_variables = [] - is_conversion_matched = (len(conversion_types) == len(raw_variables)) + is_conversion_matched = (len(conversion_specifiers) == len(raw_variables)) if is_conversion_matched: - for typ, var in zip(conversion_types, raw_variables): + for specifier, var in zip(conversion_specifiers, raw_variables): node_type = builder.node_type(var) - if typ == '%d' and (is_int_rprimitive(node_type) - or is_short_int_rprimitive(node_type)): + format_type = specifier.whole_seq + if format_type == '%d' and (is_int_rprimitive(node_type) + or is_short_int_rprimitive(node_type)): var_str = builder.call_c(int_to_str_op, [builder.accept(var)], format_expr.line) - elif typ == '%s': + elif format_type == '%s': if is_str_rprimitive(node_type): var_str = builder.accept(var) else: diff --git a/mypyc/irbuild/format_str_tokenizer.py b/mypyc/irbuild/format_str_tokenizer.py index 5354d3ddc2f6..a09c1dffb597 100644 --- a/mypyc/irbuild/format_str_tokenizer.py +++ b/mypyc/irbuild/format_str_tokenizer.py @@ -3,46 +3,37 @@ import re from typing import List, Tuple +from mypy.checkstrformat import ( + FORMAT_RE, ConversionSpecifier +) from mypyc.ir.ops import Value, Integer from mypyc.ir.rtypes import c_pyssize_t_rprimitive from mypyc.irbuild.builder import IRBuilder from mypyc.primitives.str_ops import str_build_op -# printf-style String Formatting: -# https://docs.python.org/3/library/stdtypes.html#old-string-formatting -printf_style_pattern = re.compile(r""" - ( - % # Start sign - (?:\((?P[^)]*)\))? # Optional: Mapping key - (?P[-+#0 ]+)? # Optional: Conversion flags - (?P\d+|\*)? # Optional: Minimum field width - (?:\.(?P\d+|\*))? # Optional: Precision - [hlL]? 
# Optional: Length modifier, Ignored - (?P[diouxXeEfFgGcrsa]) # Conversion type - | %%) - """, re.VERBOSE) - -def tokenizer_printf_style(format_str: str) -> Tuple[List[str], List[str]]: +def tokenizer_printf_style(format_str: str) -> Tuple[List[str], List[ConversionSpecifier]]: """Tokenize a printf-style format string using regex. Return: A list of string literals and a list of conversion operations """ - literals = [] - format_ops = [] + literals: List[str] = [] + specifiers: List[ConversionSpecifier] = [] last_end = 0 - for m in re.finditer(printf_style_pattern, format_str): + for m in re.finditer(FORMAT_RE, format_str): + whole_seq, parens_key, key, flags, width, precision, conversion_type = m.groups() + specifiers.append(ConversionSpecifier(conversion_type, key, flags, width, precision, + whole_seq=whole_seq)) + cur_start = m.start(1) - format_tmp = m.group(1) literals.append(format_str[last_end:cur_start]) - format_ops.append(format_tmp) - last_end = cur_start + len(format_tmp) + last_end = cur_start + len(whole_seq) literals.append(format_str[last_end:]) - return literals, format_ops + return literals, specifiers def join_formatted_strings(builder: IRBuilder, literals: List[str], diff --git a/mypyc/test/test_stringformatting.py b/mypyc/test/test_stringformatting.py index dc7611f6d25f..77c0a95ab60c 100644 --- a/mypyc/test/test_stringformatting.py +++ b/mypyc/test/test_stringformatting.py @@ -1,16 +1,31 @@ import unittest +from typing import List from mypyc.irbuild.format_str_tokenizer import tokenizer_printf_style class TestStringFormatting(unittest.TestCase): + def test_tokenizer_printf_style(self) -> None: - assert tokenizer_printf_style("I'm %s, id years old") == \ - (["I'm ", ', id years old'], ['%s']) - assert tokenizer_printf_style("Test: %i%%, Test: %02d, Test: %.2f") == \ - (['Test: ', '', ', Test: ', ', Test: ', ''], ['%i', '%%', '%02d', '%.2f']) - assert tokenizer_printf_style("ioasdfyuia%i%%%20s%d%sdafafadfa%s%d%x%E%.2f") == \ - (['ioasdfyuia', '', 
'', '', '', 'dafafadfa', '', '', '', '', ''], - ['%i', '%%', '%20s', '%d', '%s', '%s', '%d', '%x', '%E', '%.2f']) - assert tokenizer_printf_style("Special: %#20.2f%d, test: ") == \ - (['Special: ', '', ', test: '], ['%#20.2f', '%d']) + + def tokenizer_printf_style_helper(format_str: str, + literals: List[str], conversion: List[str]) -> bool: + l, specs = tokenizer_printf_style(format_str) + return literals == l and conversion == [x.whole_seq for x in specs] + + assert tokenizer_printf_style_helper( + "I'm %s, id years old", + ["I'm ", ', id years old'], + ['%s']) + assert tokenizer_printf_style_helper( + "Test: %i%%, Test: %02d, Test: %.2f", + ['Test: ', '', ', Test: ', ', Test: ', ''], + ['%i', '%%', '%02d', '%.2f']) + assert tokenizer_printf_style_helper( + "ioasdfyuia%i%%%20s%d%sdafafadfa%s%d%x%E%.2f", + ['ioasdfyuia', '', '', '', '', 'dafafadfa', '', '', '', '', ''], + ['%i', '%%', '%20s', '%d', '%s', '%s', '%d', '%x', '%E', '%.2f']) + assert tokenizer_printf_style_helper( + "Special: %#20.2f%d, test: ", + ['Special: ', '', ', test: '], + ['%#20.2f', '%d'])