From eaf217108226633c03cc5c4c90f0b6e4587c8803 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 21 Nov 2024 13:15:12 +0200 Subject: [PATCH] gh-126997: Fix support of non-ASCII strings in pickletools (GH-127062) * Fix support of STRING and GLOBAL opcodes with non-ASCII arguments. * dis() now outputs non-ASCII bytes in STRING, BINSTRING and SHORT_BINSTRING arguments as escaped (\xXX). --- Lib/pickletools.py | 11 ++- Lib/test/test_pickletools.py | 82 +++++++++++++++++++ ...-11-20-16-58-59.gh-issue-126997.0PI41Y.rst | 3 + 3 files changed, 92 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst diff --git a/Lib/pickletools.py b/Lib/pickletools.py index c462d26da97ce1..d9c4fb1e63e91a 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -312,7 +312,7 @@ def read_uint8(f): doc="Eight-byte unsigned integer, little-endian.") -def read_stringnl(f, decode=True, stripquotes=True): +def read_stringnl(f, decode=True, stripquotes=True, *, encoding='latin-1'): r""" >>> import io >>> read_stringnl(io.BytesIO(b"'abcd'\nefg\n")) @@ -356,7 +356,7 @@ def read_stringnl(f, decode=True, stripquotes=True): raise ValueError("no string quotes around %r" % data) if decode: - data = codecs.escape_decode(data)[0].decode("ascii") + data = codecs.escape_decode(data)[0].decode(encoding) return data stringnl = ArgumentDescriptor( @@ -370,7 +370,7 @@ def read_stringnl(f, decode=True, stripquotes=True): """) def read_stringnl_noescape(f): - return read_stringnl(f, stripquotes=False) + return read_stringnl(f, stripquotes=False, encoding='utf-8') stringnl_noescape = ArgumentDescriptor( name='stringnl_noescape', @@ -2509,7 +2509,10 @@ def dis(pickle, out=None, memo=None, indentlevel=4, annotate=0): # make a mild effort to align arguments line += ' ' * (10 - len(opcode.name)) if arg is not None: - line += ' ' + repr(arg) + if opcode.name in ("STRING", "BINSTRING", "SHORT_BINSTRING"): + line += ' ' + ascii(arg) + else: + line += ' ' + repr(arg) if markmsg: line += ' ' + markmsg if annotate: diff --git a/Lib/test/test_pickletools.py b/Lib/test/test_pickletools.py index d8ff7a25cbc4b7..265dc497ccb86c 100644 --- a/Lib/test/test_pickletools.py +++ b/Lib/test/test_pickletools.py @@ -361,6 +361,88 @@ def test_annotate(self): highest protocol among opcodes = 0 ''', annotate=20) + def test_string(self): + self.check_dis(b"S'abc'\n.", '''\ + 0: S STRING 'abc' + 7: . STOP +highest protocol among opcodes = 0 +''') + self.check_dis(b'S"abc"\n.', '''\ + 0: S STRING 'abc' + 7: . STOP +highest protocol among opcodes = 0 +''') + self.check_dis(b"S'\xc3\xb5'\n.", '''\ + 0: S STRING '\\xc3\\xb5' + 6: . STOP +highest protocol among opcodes = 0 +''') + + def test_string_without_quotes(self): + self.check_dis_error(b"Sabc'\n.", '', + 'no string quotes around b"abc\'"') + self.check_dis_error(b'Sabc"\n.', '', + "no string quotes around b'abc\"'") + self.check_dis_error(b"S'abc\n.", '', + '''strinq quote b"'" not found at both ends of b"'abc"''') + self.check_dis_error(b'S"abc\n.', '', + r"""strinq quote b'"' not found at both ends of b'"abc'""") + self.check_dis_error(b"S'abc\"\n.", '', + r"""strinq quote b"'" not found at both ends of b'\\'abc"'""") + self.check_dis_error(b"S\"abc'\n.", '', + r"""strinq quote b'"' not found at both ends of b'"abc\\''""") + + def test_binstring(self): + self.check_dis(b"T\x03\x00\x00\x00abc.", '''\ + 0: T BINSTRING 'abc' + 8: . STOP +highest protocol among opcodes = 1 +''') + self.check_dis(b"T\x02\x00\x00\x00\xc3\xb5.", '''\ + 0: T BINSTRING '\\xc3\\xb5' + 7: . STOP +highest protocol among opcodes = 1 +''') + + def test_short_binstring(self): + self.check_dis(b"U\x03abc.", '''\ + 0: U SHORT_BINSTRING 'abc' + 5: . STOP +highest protocol among opcodes = 1 +''') + self.check_dis(b"U\x02\xc3\xb5.", '''\ + 0: U SHORT_BINSTRING '\\xc3\\xb5' + 4: . STOP +highest protocol among opcodes = 1 +''') + + def test_global(self): + self.check_dis(b"cmodule\nname\n.", '''\ + 0: c GLOBAL 'module name' + 13: . STOP +highest protocol among opcodes = 0 +''') + self.check_dis(b"cm\xc3\xb6dule\nn\xc3\xa4me\n.", '''\ + 0: c GLOBAL 'm\xf6dule n\xe4me' + 15: . STOP +highest protocol among opcodes = 0 +''') + + def test_inst(self): + self.check_dis(b"(imodule\nname\n.", '''\ + 0: ( MARK + 1: i INST 'module name' (MARK at 0) + 14: . STOP +highest protocol among opcodes = 0 +''') + + def test_persid(self): + self.check_dis(b"Pabc\n.", '''\ + 0: P PERSID 'abc' + 5: . STOP +highest protocol among opcodes = 0 +''') + class MiscTestCase(unittest.TestCase): def test__all__(self): diff --git a/Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst b/Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst new file mode 100644 index 00000000000000..b85c51ef07dcbe --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-20-16-58-59.gh-issue-126997.0PI41Y.rst @@ -0,0 +1,3 @@ +Fix support of STRING and GLOBAL opcodes with non-ASCII arguments in +:mod:`pickletools`. :func:`pickletools.dis` now outputs non-ASCII bytes in +STRING, BINSTRING and SHORT_BINSTRING arguments as escaped (``\xXX``).