From 1cd02c470b49162b306de47729601bd21ba29c46 Mon Sep 17 00:00:00 2001
From: dgw <dgw@technobabbl.es>
Date: Sat, 14 Sep 2024 07:53:47 -0500
Subject: [PATCH 1/3] irc.utils, test: also remove null (\x00) in safe()

In `test_irc_utils` suite, rewrote `test_safe` and `test_safe_bytes` to
generate a bunch more permutations of invalid characters automatically
rather than writing them all out. The horse is definitely beaten *far*
past death at this point... but the tests sure are thorough(bred).

Also renamed `test_safe_null` -> `test_safe_none` to avoid confusion;
`test_safe` and `test_safe_bytes` are the cases that check behavior with
null characters/bytes. Python's docs technically do refer to `None` as
the "null object" once or twice, but most of the docs (and most of us
devs) simply call it `None`.
---
 sopel/irc/utils.py         |  8 ++++---
 test/irc/test_irc_utils.py | 49 ++++++++++++++++++++++----------------
 2 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/sopel/irc/utils.py b/sopel/irc/utils.py
index 12ba094618..3d3bf450f1 100644
--- a/sopel/irc/utils.py
+++ b/sopel/irc/utils.py
@@ -25,10 +25,11 @@ def safe(string):
     :rtype: str
     :raises TypeError: when ``string`` is ``None``
 
-    This function removes newlines from a string and always returns a unicode
-    string (``str``), but doesn't strip or alter it in any other way::
+    This function removes newlines and null-bytes from a string. It will always
+    return a Unicode ``str``, even if given non-Unicode input, but doesn't strip
+    or alter the string in any other way::
 
-        >>> safe('some text\\r\\n')
+        >>> safe('some \x00text\\r\\n')
         'some text'
 
     This is useful to ensure a string can be used in a IRC message.
@@ -45,6 +46,7 @@ def safe(string):
         string = string.decode("utf8")
     string = string.replace('\n', '')
     string = string.replace('\r', '')
+    string = string.replace('\x00', '')
     return string
 
 
diff --git a/test/irc/test_irc_utils.py b/test/irc/test_irc_utils.py
index 94eab62986..3204444e20 100644
--- a/test/irc/test_irc_utils.py
+++ b/test/irc/test_irc_utils.py
@@ -1,6 +1,8 @@
 """Tests for core ``sopel.irc.utils``"""
 from __future__ import annotations
 
+from itertools import permutations
+
 import pytest
 
 from sopel.irc import utils
@@ -8,15 +10,19 @@
 
 def test_safe():
     text = 'some text'
-    assert utils.safe(text + '\r\n') == text
-    assert utils.safe(text + '\n') == text
-    assert utils.safe(text + '\r') == text
-    assert utils.safe('\r\n' + text) == text
-    assert utils.safe('\n' + text) == text
-    assert utils.safe('\r' + text) == text
-    assert utils.safe('some \r\ntext') == text
-    assert utils.safe('some \ntext') == text
-    assert utils.safe('some \rtext') == text
+    variants = permutations(('\n', '\r', '\x00'))
+    for variant in variants:
+        seq = ''.join(variant)
+        assert utils.safe(text + seq) == text
+        assert utils.safe(seq + text) == text
+        assert utils.safe('some ' + seq + 'text') == text
+        assert utils.safe(
+            variant[0]
+            + 'some '
+            + variant[1]
+            + 'text'
+            + variant[2]
+        ) == text
 
 
 def test_safe_empty():
@@ -24,20 +30,23 @@ def test_safe_empty():
     assert utils.safe(text) == text
 
 
-def test_safe_null():
+def test_safe_none():
     with pytest.raises(TypeError):
         utils.safe(None)
 
 
 def test_safe_bytes():
     text = b'some text'
-    assert utils.safe(text) == text.decode('utf-8')
-    assert utils.safe(text + b'\r\n') == text.decode('utf-8')
-    assert utils.safe(text + b'\n') == text.decode('utf-8')
-    assert utils.safe(text + b'\r') == text.decode('utf-8')
-    assert utils.safe(b'\r\n' + text) == text.decode('utf-8')
-    assert utils.safe(b'\n' + text) == text.decode('utf-8')
-    assert utils.safe(b'\r' + text) == text.decode('utf-8')
-    assert utils.safe(b'some \r\ntext') == text.decode('utf-8')
-    assert utils.safe(b'some \ntext') == text.decode('utf-8')
-    assert utils.safe(b'some \rtext') == text.decode('utf-8')
+    variants = permutations((b'\n', b'\r', b'\x00'))
+    for variant in variants:
+        seq = b''.join(variant)
+        assert utils.safe(text + seq) == text.decode('utf-8')
+        assert utils.safe(seq + text) == text.decode('utf-8')
+        assert utils.safe(b'some ' + seq + b'text') == text.decode('utf-8')
+        assert utils.safe(
+            variant[0]
+            + b'some '
+            + variant[1]
+            + b'text'
+            + variant[2]
+        ) == text.decode('utf-8')

From 0e4418691ba4a6af1fdf56e6224b61270af8643f Mon Sep 17 00:00:00 2001
From: dgw <dgw@technobabbl.es>
Date: Sun, 15 Sep 2024 14:20:14 -0500
Subject: [PATCH 2/3] test: eliminate for...in loops in test_irc_utils suite

Using `pytest.mark.parametrize()` instead. Two test cases become 12.

Note: Yes, it is possible to have pytest treat each tuple generated by
`itertools.permutations()` as a single argument. Doing so yields useless
generated test-case names like `test_safe[seq0]`, though. Using three
separate test parameters of simple types (`str` or `bytes`) yields more
informative names like `test_safe[\n-\r-\x00]`.
---
 test/irc/test_irc_utils.py | 56 +++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/test/irc/test_irc_utils.py b/test/irc/test_irc_utils.py
index 3204444e20..47f2e3d9a3 100644
--- a/test/irc/test_irc_utils.py
+++ b/test/irc/test_irc_utils.py
@@ -8,21 +8,21 @@
 from sopel.irc import utils
 
 
-def test_safe():
+@pytest.mark.parametrize('s1, s2, s3', permutations(('\n', '\r', '\x00')))
+def test_safe(s1, s2, s3):
     text = 'some text'
-    variants = permutations(('\n', '\r', '\x00'))
-    for variant in variants:
-        seq = ''.join(variant)
-        assert utils.safe(text + seq) == text
-        assert utils.safe(seq + text) == text
-        assert utils.safe('some ' + seq + 'text') == text
-        assert utils.safe(
-            variant[0]
-            + 'some '
-            + variant[1]
-            + 'text'
-            + variant[2]
-        ) == text
+    seq = ''.join((s1, s2, s3))
+
+    assert utils.safe(text + seq) == text
+    assert utils.safe(seq + text) == text
+    assert utils.safe('some ' + seq + 'text') == text
+    assert utils.safe(
+        s1
+        + 'some '
+        + s2
+        + 'text'
+        + s3
+    ) == text
 
 
 def test_safe_empty():
@@ -35,18 +35,18 @@ def test_safe_none():
         utils.safe(None)
 
 
-def test_safe_bytes():
+@pytest.mark.parametrize('b1, b2, b3', permutations((b'\n', b'\r', b'\x00')))
+def test_safe_bytes(b1, b2, b3):
     text = b'some text'
-    variants = permutations((b'\n', b'\r', b'\x00'))
-    for variant in variants:
-        seq = b''.join(variant)
-        assert utils.safe(text + seq) == text.decode('utf-8')
-        assert utils.safe(seq + text) == text.decode('utf-8')
-        assert utils.safe(b'some ' + seq + b'text') == text.decode('utf-8')
-        assert utils.safe(
-            variant[0]
-            + b'some '
-            + variant[1]
-            + b'text'
-            + variant[2]
-        ) == text.decode('utf-8')
+    seq = b''.join((b1, b2, b3))
+
+    assert utils.safe(text + seq) == text.decode('utf-8')
+    assert utils.safe(seq + text) == text.decode('utf-8')
+    assert utils.safe(b'some ' + seq + b'text') == text.decode('utf-8')
+    assert utils.safe(
+        b1
+        + b'some '
+        + b2
+        + b'text'
+        + b3
+    ) == text.decode('utf-8')

From db4b2962a97987c6338a8991623c4669a2a32a7e Mon Sep 17 00:00:00 2001
From: dgw <dgw@technobabbl.es>
Date: Sun, 13 Oct 2024 12:12:04 -0500
Subject: [PATCH 3/3] irc.utils: improve `safe()` docstring

Link to RFC explaining why CR, LF, and NUL are prohibited, and annotate
the version in which NUL stripping was added.
---
 sopel/irc/utils.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/sopel/irc/utils.py b/sopel/irc/utils.py
index 3d3bf450f1..a812f96876 100644
--- a/sopel/irc/utils.py
+++ b/sopel/irc/utils.py
@@ -17,28 +17,32 @@
 from sopel.lifecycle import deprecated
 
 
-def safe(string):
-    """Remove newlines from a string.
+def safe(string: str) -> str:
+    """Remove disallowed bytes from a string, and ensure Unicode.
 
-    :param str string: input text to process
-    :return: the string without newlines
-    :rtype: str
+    :param string: input text to process
+    :return: the string as Unicode without characters prohibited in IRC messages
     :raises TypeError: when ``string`` is ``None``
 
-    This function removes newlines and null-bytes from a string. It will always
+    This function removes newlines and null bytes from a string. It will always
     return a Unicode ``str``, even if given non-Unicode input, but doesn't strip
     or alter the string in any other way::
 
-        >>> safe('some \x00text\\r\\n')
+        >>> safe('some \\x00text\\r\\n')
         'some text'
 
-    This is useful to ensure a string can be used in a IRC message.
+    This is useful to ensure a string can be used in a IRC message. Parameters
+    can **never** contain NUL, CR, or LF octets, per :rfc:`2812#section-2.3.1`.
 
     .. versionchanged:: 7.1
 
         This function now raises a :exc:`TypeError` instead of an unpredictable
         behaviour when given ``None``.
 
+    .. versionchanged:: 8.0.1
+
+        Also remove NUL (``\\x00``) in addition to CR/LF.
+
     """
     if string is None:
         raise TypeError('safe function requires a string, not NoneType')