Skip to content

Commit

Permalink
[mypyc] Use mypy.FORMAT_RE and ConversionSpecifier for % interpolation (
Browse files Browse the repository at this point in the history
#10877)

mypy.checkstrformat already offers a regex (FORMAT_RE) and the ConversionSpecifier class for tokenizing, so this PR:

* deletes the redundant code
* uses ConversionSpecifier as FormatOp
  • Loading branch information
97littleleaf11 authored Jul 29, 2021
1 parent a54a177 commit 84504b0
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 57 deletions.
49 changes: 29 additions & 20 deletions mypy/checkstrformat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import re

from typing import (
cast, List, Tuple, Dict, Callable, Union, Optional, Pattern, Match, Set, Any
cast, List, Tuple, Dict, Callable, Union, Optional, Pattern, Match, Set
)
from typing_extensions import Final, TYPE_CHECKING

Expand Down Expand Up @@ -50,14 +50,14 @@ def compile_format_re() -> Pattern[str]:
See https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting
The regexp is intentionally a bit wider to report better errors.
"""
key_re = r'(\(([^()]*)\))?' # (optional) parenthesised sequence of characters.
flags_re = r'([#0\-+ ]*)' # (optional) sequence of flags.
width_re = r'(\*|[1-9][0-9]*)?' # (optional) minimum field width (* or numbers).
precision_re = r'(?:\.(\*|[0-9]+)?)?'  # (optional) . followed by * or numbers.
key_re = r'(\((?P<key>[^)]*)\))?' # (optional) parenthesised sequence of characters.
flags_re = r'(?P<flag>[#0\-+ ]*)' # (optional) sequence of flags.
width_re = r'(?P<width>[1-9][0-9]*|\*)?' # (optional) minimum field width (* or numbers).
precision_re = r'(?:\.(?P<precision>\*|[0-9]+)?)?'  # (optional) . followed by * or numbers.
length_mod_re = r'[hlL]?' # (optional) length modifier (unused).
type_re = r'(.)?' # conversion type.
type_re = r'(?P<type>.)?' # conversion type.
format_re = '%' + key_re + flags_re + width_re + precision_re + length_mod_re + type_re
return re.compile(format_re)
return re.compile('({})'.format(format_re))


def compile_new_format_re(custom_spec: bool) -> Pattern[str]:
Expand Down Expand Up @@ -114,16 +114,20 @@ def compile_new_format_re(custom_spec: bool) -> Pattern[str]:


class ConversionSpecifier:
def __init__(self, key: Optional[str],
flags: str, width: str, precision: str, type: str,
def __init__(self, type: str,
key: Optional[str],
flags: Optional[str],
width: Optional[str],
precision: Optional[str],
format_spec: Optional[str] = None,
conversion: Optional[str] = None,
field: Optional[str] = None) -> None:
field: Optional[str] = None,
whole_seq: Optional[str] = None) -> None:
self.type = type
self.key = key
self.flags = flags
self.width = width
self.precision = precision
self.type = type
# Used only for str.format() calls (it may be custom for types with __format__()).
self.format_spec = format_spec
self.non_standard_format_spec = False
Expand All @@ -132,24 +136,27 @@ def __init__(self, key: Optional[str],
# Full formatted expression (i.e. key plus following attributes and/or indexes).
# Used only for str.format() calls.
self.field = field
self.whole_seq = whole_seq

@classmethod
def from_match(cls, match_obj: Match[str],
def from_match(cls, match: Match[str],
non_standard_spec: bool = False) -> 'ConversionSpecifier':
"""Construct specifier from match object resulted from parsing str.format() call."""
match = cast(Any, match_obj) # TODO: remove this once typeshed is fixed.
if non_standard_spec:
spec = cls(match.group('key'),
flags='', width='', precision='', type='',
spec = cls(type='',
key=match.group('key'),
flags='', width='', precision='',
format_spec=match.group('format_spec'),
conversion=match.group('conversion'),
field=match.group('field'))
spec.non_standard_format_spec = True
return spec
# Replace unmatched optional groups with empty matches (for convenience).
return cls(match.group('key'),
flags=match.group('flags') or '', width=match.group('width') or '',
precision=match.group('precision') or '', type=match.group('type') or '',
return cls(type=match.group('type') or '',
key=match.group('key'),
flags=match.group('flags') or '',
width=match.group('width') or '',
precision=match.group('precision') or '',
format_spec=match.group('format_spec'),
conversion=match.group('conversion'),
field=match.group('field'))
Expand Down Expand Up @@ -622,10 +629,12 @@ def check_str_interpolation(self,

def parse_conversion_specifiers(self, format: str) -> List[ConversionSpecifier]:
specifiers: List[ConversionSpecifier] = []
for parens_key, key, flags, width, precision, type in FORMAT_RE.findall(format):
for whole_seq, parens_key, key, flags, width, precision, type \
in FORMAT_RE.findall(format):
if parens_key == '':
key = None
specifiers.append(ConversionSpecifier(key, flags, width, precision, type))
specifiers.append(ConversionSpecifier(type, key, flags, width, precision,
whole_seq=whole_seq))
return specifiers

def analyze_conversion_specifiers(self, specifiers: List[ConversionSpecifier],
Expand Down
14 changes: 8 additions & 6 deletions mypyc/irbuild/expression.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,8 @@ def transform_basic_comparison(builder: IRBuilder,
def translate_str_format_percent_sign(builder: IRBuilder,
format_expr: StrExpr,
rhs: Expression) -> Value:
literals, conversion_types = tokenizer_printf_style(format_expr.value)
literals, conversion_specifiers = tokenizer_printf_style(format_expr.value)

variables = []
if isinstance(rhs, TupleExpr):
raw_variables = rhs.items
Expand All @@ -578,15 +579,16 @@ def translate_str_format_percent_sign(builder: IRBuilder,
else:
raw_variables = []

is_conversion_matched = (len(conversion_types) == len(raw_variables))
is_conversion_matched = (len(conversion_specifiers) == len(raw_variables))

if is_conversion_matched:
for typ, var in zip(conversion_types, raw_variables):
for specifier, var in zip(conversion_specifiers, raw_variables):
node_type = builder.node_type(var)
if typ == '%d' and (is_int_rprimitive(node_type)
or is_short_int_rprimitive(node_type)):
format_type = specifier.whole_seq
if format_type == '%d' and (is_int_rprimitive(node_type)
or is_short_int_rprimitive(node_type)):
var_str = builder.call_c(int_to_str_op, [builder.accept(var)], format_expr.line)
elif typ == '%s':
elif format_type == '%s':
if is_str_rprimitive(node_type):
var_str = builder.accept(var)
else:
Expand Down
35 changes: 13 additions & 22 deletions mypyc/irbuild/format_str_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,37 @@
import re
from typing import List, Tuple

from mypy.checkstrformat import (
FORMAT_RE, ConversionSpecifier
)
from mypyc.ir.ops import Value, Integer
from mypyc.ir.rtypes import c_pyssize_t_rprimitive
from mypyc.irbuild.builder import IRBuilder
from mypyc.primitives.str_ops import str_build_op

# printf-style String Formatting:
# https://docs.python.org/3/library/stdtypes.html#old-string-formatting
printf_style_pattern = re.compile(r"""
(
% # Start sign
(?:\((?P<key>[^)]*)\))? # Optional: Mapping key
(?P<flag>[-+#0 ]+)? # Optional: Conversion flags
(?P<width>\d+|\*)? # Optional: Minimum field width
(?:\.(?P<precision>\d+|\*))? # Optional: Precision
[hlL]? # Optional: Length modifier, Ignored
(?P<type>[diouxXeEfFgGcrsa]) # Conversion type
| %%)
""", re.VERBOSE)


def tokenizer_printf_style(format_str: str) -> Tuple[List[str], List[str]]:
def tokenizer_printf_style(format_str: str) -> Tuple[List[str], List[ConversionSpecifier]]:
"""Tokenize a printf-style format string using regex.
Return:
A list of string literals and a list of conversion operations
"""
literals = []
format_ops = []
literals: List[str] = []
specifiers: List[ConversionSpecifier] = []
last_end = 0

for m in re.finditer(printf_style_pattern, format_str):
for m in re.finditer(FORMAT_RE, format_str):
whole_seq, parens_key, key, flags, width, precision, conversion_type = m.groups()
specifiers.append(ConversionSpecifier(conversion_type, key, flags, width, precision,
whole_seq=whole_seq))

cur_start = m.start(1)
format_tmp = m.group(1)
literals.append(format_str[last_end:cur_start])
format_ops.append(format_tmp)
last_end = cur_start + len(format_tmp)
last_end = cur_start + len(whole_seq)

literals.append(format_str[last_end:])

return literals, format_ops
return literals, specifiers


def join_formatted_strings(builder: IRBuilder, literals: List[str],
Expand Down
33 changes: 24 additions & 9 deletions mypyc/test/test_stringformatting.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,31 @@
import unittest
from typing import List

from mypyc.irbuild.format_str_tokenizer import tokenizer_printf_style


class TestStringFormatting(unittest.TestCase):

def test_tokenizer_printf_style(self) -> None:
assert tokenizer_printf_style("I'm %s, id years old") == \
(["I'm ", ', id years old'], ['%s'])
assert tokenizer_printf_style("Test: %i%%, Test: %02d, Test: %.2f") == \
(['Test: ', '', ', Test: ', ', Test: ', ''], ['%i', '%%', '%02d', '%.2f'])
assert tokenizer_printf_style("ioasdfyuia%i%%%20s%d%sdafafadfa%s%d%x%E%.2f") == \
(['ioasdfyuia', '', '', '', '', 'dafafadfa', '', '', '', '', ''],
['%i', '%%', '%20s', '%d', '%s', '%s', '%d', '%x', '%E', '%.2f'])
assert tokenizer_printf_style("Special: %#20.2f%d, test: ") == \
(['Special: ', '', ', test: '], ['%#20.2f', '%d'])

def tokenizer_printf_style_helper(format_str: str,
literals: List[str], conversion: List[str]) -> bool:
l, specs = tokenizer_printf_style(format_str)
return literals == l and conversion == [x.whole_seq for x in specs]

assert tokenizer_printf_style_helper(
"I'm %s, id years old",
["I'm ", ', id years old'],
['%s'])
assert tokenizer_printf_style_helper(
"Test: %i%%, Test: %02d, Test: %.2f",
['Test: ', '', ', Test: ', ', Test: ', ''],
['%i', '%%', '%02d', '%.2f'])
assert tokenizer_printf_style_helper(
"ioasdfyuia%i%%%20s%d%sdafafadfa%s%d%x%E%.2f",
['ioasdfyuia', '', '', '', '', 'dafafadfa', '', '', '', '', ''],
['%i', '%%', '%20s', '%d', '%s', '%s', '%d', '%x', '%E', '%.2f'])
assert tokenizer_printf_style_helper(
"Special: %#20.2f%d, test: ",
['Special: ', '', ', test: '],
['%#20.2f', '%d'])

0 comments on commit 84504b0

Please sign in to comment.