From 1cd02c470b49162b306de47729601bd21ba29c46 Mon Sep 17 00:00:00 2001 From: dgw Date: Sat, 14 Sep 2024 07:53:47 -0500 Subject: [PATCH 1/3] irc.utils, test: also remove null (\x00) in safe() In `test_irc_utils` suite, rewrote `test_safe` and `test_safe_bytes` to generate a bunch more permutations of invalid characters automatically rather than writing them all out. The horse is definitely beaten *far* past death at this point... but the tests sure are thorough(bred). Also renamed `test_safe_null` -> `test_safe_none` to avoid confusion; `test_safe` and `test_safe_bytes` are the cases that check behavior with null characters/bytes. Python's docs technically do refer to `None` as the "null object" once or twice, but most of the docs (and most of us devs) simply call it `None`. --- sopel/irc/utils.py | 8 ++++--- test/irc/test_irc_utils.py | 49 ++++++++++++++++++++++---------------- 2 files changed, 34 insertions(+), 23 deletions(-) diff --git a/sopel/irc/utils.py b/sopel/irc/utils.py index 12ba094618..3d3bf450f1 100644 --- a/sopel/irc/utils.py +++ b/sopel/irc/utils.py @@ -25,10 +25,11 @@ def safe(string): :rtype: str :raises TypeError: when ``string`` is ``None`` - This function removes newlines from a string and always returns a unicode - string (``str``), but doesn't strip or alter it in any other way:: + This function removes newlines and null-bytes from a string. It will always + return a Unicode ``str``, even if given non-Unicode input, but doesn't strip + or alter the string in any other way:: - >>> safe('some text\\r\\n') + >>> safe('some \x00text\\r\\n') 'some text' This is useful to ensure a string can be used in a IRC message. @@ -45,6 +46,7 @@ def safe(string): string = string.decode("utf8") string = string.replace('\n', '') string = string.replace('\r', '') + string = string.replace('\x00', '') return string diff --git a/test/irc/test_irc_utils.py b/test/irc/test_irc_utils.py index 94eab62986..3204444e20 100644 --- a/test/irc/test_irc_utils.py +++ b/test/irc/test_irc_utils.py @@ -1,6 +1,8 @@ """Tests for core ``sopel.irc.utils``""" from __future__ import annotations +from itertools import permutations + import pytest from sopel.irc import utils @@ -8,15 +10,19 @@ def test_safe(): text = 'some text' - assert utils.safe(text + '\r\n') == text - assert utils.safe(text + '\n') == text - assert utils.safe(text + '\r') == text - assert utils.safe('\r\n' + text) == text - assert utils.safe('\n' + text) == text - assert utils.safe('\r' + text) == text - assert utils.safe('some \r\ntext') == text - assert utils.safe('some \ntext') == text - assert utils.safe('some \rtext') == text + variants = permutations(('\n', '\r', '\x00')) + for variant in variants: + seq = ''.join(variant) + assert utils.safe(text + seq) == text + assert utils.safe(seq + text) == text + assert utils.safe('some ' + seq + 'text') == text + assert utils.safe( + variant[0] + + 'some ' + + variant[1] + + 'text' + + variant[2] + ) == text def test_safe_empty(): @@ -24,20 +30,23 @@ def test_safe_empty(): assert utils.safe(text) == text -def test_safe_null(): +def test_safe_none(): with pytest.raises(TypeError): utils.safe(None) def test_safe_bytes(): text = b'some text' - assert utils.safe(text) == text.decode('utf-8') - assert utils.safe(text + b'\r\n') == text.decode('utf-8') - assert utils.safe(text + b'\n') == text.decode('utf-8') - assert utils.safe(text + b'\r') == text.decode('utf-8') - assert utils.safe(b'\r\n' + text) == text.decode('utf-8') - assert utils.safe(b'\n' + text) == text.decode('utf-8') - assert utils.safe(b'\r' + text) == text.decode('utf-8') - assert utils.safe(b'some \r\ntext') == text.decode('utf-8') - assert utils.safe(b'some \ntext') == text.decode('utf-8') - assert utils.safe(b'some \rtext') == text.decode('utf-8') + variants = permutations((b'\n', b'\r', b'\x00')) + for variant in variants: + seq = b''.join(variant) + assert utils.safe(text + seq) == text.decode('utf-8') + assert utils.safe(seq + text) == text.decode('utf-8') + assert utils.safe(b'some ' + seq + b'text') == text.decode('utf-8') + assert utils.safe( + variant[0] + + b'some ' + + variant[1] + + b'text' + + variant[2] + ) == text.decode('utf-8') From 0e4418691ba4a6af1fdf56e6224b61270af8643f Mon Sep 17 00:00:00 2001 From: dgw Date: Sun, 15 Sep 2024 14:20:14 -0500 Subject: [PATCH 2/3] test: eliminate for...in loops in test_irc_utils suite Using `pytest.mark.parametrize()` instead. Two test cases become 12. Note: Yes, it is possible to have pytest treat each tuple generated by `itertools.permutations()` as a single argument. Doing so yields useless generated test-case names like `test_safe[seq0]`, though. Using three separate test parameters of simple types (`str` or `bytes`) yields more informative names like `test_safe[\n-\r-\x00]`. --- test/irc/test_irc_utils.py | 56 +++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/test/irc/test_irc_utils.py b/test/irc/test_irc_utils.py index 3204444e20..47f2e3d9a3 100644 --- a/test/irc/test_irc_utils.py +++ b/test/irc/test_irc_utils.py @@ -8,21 +8,21 @@ from sopel.irc import utils -def test_safe(): +@pytest.mark.parametrize('s1, s2, s3', permutations(('\n', '\r', '\x00'))) +def test_safe(s1, s2, s3): text = 'some text' - variants = permutations(('\n', '\r', '\x00')) - for variant in variants: - seq = ''.join(variant) - assert utils.safe(text + seq) == text - assert utils.safe(seq + text) == text - assert utils.safe('some ' + seq + 'text') == text - assert utils.safe( - variant[0] - + 'some ' - + variant[1] - + 'text' - + variant[2] - ) == text + seq = ''.join((s1, s2, s3)) + + assert utils.safe(text + seq) == text + assert utils.safe(seq + text) == text + assert utils.safe('some ' + seq + 'text') == text + assert utils.safe( + s1 + + 'some ' + + s2 + + 'text' + + s3 + ) == text def test_safe_empty(): @@ -35,18 +35,18 @@ def test_safe_none(): utils.safe(None) -def test_safe_bytes(): +@pytest.mark.parametrize('b1, b2, b3', permutations((b'\n', b'\r', b'\x00'))) +def test_safe_bytes(b1, b2, b3): text = b'some text' - variants = permutations((b'\n', b'\r', b'\x00')) - for variant in variants: - seq = b''.join(variant) - assert utils.safe(text + seq) == text.decode('utf-8') - assert utils.safe(seq + text) == text.decode('utf-8') - assert utils.safe(b'some ' + seq + b'text') == text.decode('utf-8') - assert utils.safe( - variant[0] - + b'some ' - + variant[1] - + b'text' - + variant[2] - ) == text.decode('utf-8') + seq = b''.join((b1, b2, b3)) + + assert utils.safe(text + seq) == text.decode('utf-8') + assert utils.safe(seq + text) == text.decode('utf-8') + assert utils.safe(b'some ' + seq + b'text') == text.decode('utf-8') + assert utils.safe( + b1 + + b'some ' + + b2 + + b'text' + + b3 + ) == text.decode('utf-8') From db4b2962a97987c6338a8991623c4669a2a32a7e Mon Sep 17 00:00:00 2001 From: dgw Date: Sun, 13 Oct 2024 12:12:04 -0500 Subject: [PATCH 3/3] irc.utils: improve `safe()` docstring Link to RFC explaining why CR, LF, and NUL are prohibited, and annotate the version in which NUL stripping was added. --- sopel/irc/utils.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sopel/irc/utils.py b/sopel/irc/utils.py index 3d3bf450f1..a812f96876 100644 --- a/sopel/irc/utils.py +++ b/sopel/irc/utils.py @@ -17,28 +17,32 @@ from sopel.lifecycle import deprecated -def safe(string): - """Remove newlines from a string. +def safe(string: str) -> str: + """Remove disallowed bytes from a string, and ensure Unicode. - :param str string: input text to process - :return: the string without newlines - :rtype: str + :param string: input text to process + :return: the string as Unicode without characters prohibited in IRC messages :raises TypeError: when ``string`` is ``None`` - This function removes newlines and null-bytes from a string. It will always + This function removes newlines and null bytes from a string. It will always return a Unicode ``str``, even if given non-Unicode input, but doesn't strip or alter the string in any other way:: - >>> safe('some \x00text\\r\\n') + >>> safe('some \\x00text\\r\\n') 'some text' - This is useful to ensure a string can be used in a IRC message. + This is useful to ensure a string can be used in a IRC message. Parameters + can **never** contain NUL, CR, or LF octets, per :rfc:`2812#section-2.3.1`. .. versionchanged:: 7.1 This function now raises a :exc:`TypeError` instead of an unpredictable behaviour when given ``None``. + .. versionchanged:: 8.0.1 + + Also remove NUL (``\\x00``) in addition to CR/LF. + """ if string is None: raise TypeError('safe function requires a string, not NoneType')