logic: do not decode bytes in tuple

Fix the logic that mistakenly decoded an input domain name passed as a tuple of bytes; this fixes issue #31. Added some test cases and mentioned this behavior in the README.

Before this change, a tuple-of-bytes input matched the PSL if the bytes were valid UTF-8.

```python
psl = PublicSuffixList("例.example")
psl.publicsuffix("例.example")  # "例.example"
psl.publicsuffix("xn--fsq.example")  # "xn--fsq.example"
psl.publicsuffix((b"xn--fsq", b"example"))  # (b"xn--fsq", b"example")
psl.publicsuffix((b"\xe4\xbe\x8b", b"example"))  # (b"\xe4\xbe\x8b", b"example")
```

The expected behavior is:

```python
psl.publicsuffix((b"\xe4\xbe\x8b", b"example"))  # (b"example",)
```

The last case should not match in its entirety, since a bytes object does not carry its encoding information. We should evaluate the binary input as-is, except for the ASCII case conversion defined in the evaluation rule.

The old behavior can be problematic if the encoding of arbitrary input cannot be enforced and/or the input must be decoded from bytes to str using punycode. Assuming UTF-8 is incorrect in this context. In cases where evaluating binary input as UTF-8 is required, callers should re-encode the input into a tuple of punycoded bytes, or use a scalar str.

Signed-off-by: ko-zu <[email protected]>
ko-zu · Jun 21, 2024 · 7e492aa · 7e492aa
1 parent a942231
commit 7e492aa
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -107,6 +107,12 @@ cannot be decoded or represented as a standard domain name.
 Example:
 ```python
 psl.privatesuffix((b"a.a", b"a.example\xff", b"com"))  # (b"a.example\xff", b"com")
+
+# Note that IDNs must be punycoded when passed as a tuple of bytes.
+psl = PublicSuffixList("例.example")
+psl.publicsuffix((b"xn--fsq", b"example"))  # (b"xn--fsq", b"example")
+# UTF-8 encoded bytes of "例" do not match.
+psl.publicsuffix((b"\xe4\xbe\x8b", b"example"))  # (b"example",)
 ```
 
 License

diff --git a/publicsuffixlist/__init__.py b/publicsuffixlist/__init__.py
@@ -143,7 +143,7 @@ def _preparedomain(self, domain) -> Union[Tuple[str, Labels], Tuple[BytesTuple,
 
         elif isinstance(domain, iterable):
             domain = tuple(bytes(x) for x in domain)
-            labels = tuple(str(x, ENCODING, ERRORMODE).lower()
+            labels = tuple(str(x, "ascii", ERRORMODE).lower()
                            for x in domain)
         else:
             raise TypeError("Only str, Iter[ByteString] are supported.")

diff --git a/publicsuffixlist/test.py b/publicsuffixlist/test.py
@@ -208,6 +208,45 @@ def test_bytestuple(self):
         self.assertEqual(psl.publicsuffix(data), pubres)
         self.assertEqual(psl.privatesuffix(data), privres)
 
+    def test_bytestuple_punycode(self):
+        source = """
+example
+例.example
+"""
+        psl = PublicSuffixList(source)
+        # punycoded ASCII should match
+        data = bytestuple("aaa.www.例.example".encode("idna"))
+        pubres  = data[-2:] # xn--fsq.example
+        privres = data[-3:]
+        self.assertEqual(psl.publicsuffix(data), pubres)
+        self.assertEqual(psl.privatesuffix(data), privres)
+
+    def test_bytestuple_utf8(self):
+        source = """
+example
+例.example
+"""
+        psl = PublicSuffixList(source)
+        # UTF-8 encoded bytes should NOT match
+        data = bytestuple("aaa.www.例.example".encode("utf8"))
+        pubres  = data[-1:] # example
+        privres = data[-2:]
+        self.assertEqual(psl.publicsuffix(data), pubres)
+        self.assertEqual(psl.privatesuffix(data), privres)
+
+    def test_bytestuple_otherencoding(self):
+        source = """
+example
+例.example
+"""
+        psl = PublicSuffixList(source.splitlines())
+        # Shift_JIS encoded bytes should NOT match
+        data = bytestuple("aaa.www.例.example".encode("sjis"))
+        pubres  = data[-1:] # example
+        privres = data[-2:]
+        self.assertEqual(psl.publicsuffix(data), pubres)
+        self.assertEqual(psl.privatesuffix(data), privres)
+
     def test_bytestuple_empty(self):
         psl = self.psl
         self.assertEqual(psl.publicsuffix(()), None)