Fix utils.encoding.auto_decode() LookupError with invalid encodings

utils.encoding.auto_decode() was broken when decoding Big Endian BOM byte-strings on Little Endian systems, or vice versa. Several codec names listed in utils.encoding.BOMS (for example 'utf16-be') were not valid Python codec names, so decoding data whose BOM selected one of them raised LookupError. The TestEncoding.test_auto_decode_utf_16_le test was failing on Big Endian systems, such as Fedora's s390x builders. A similar test with a BE BOM, test_auto_decode_utf_16_be, was added in order to reproduce this on a Little Endian system (which is much easier to come by). A regression test was added to check that all encodings listed in utils.encoding.BOMS are valid. Fixes #6054.
pypa · Mar 1, 2019 · d48475d · d48475d
1 parent 729404d
commit d48475d
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 9 deletions.
diff --git a/news/6054.bugfix b/news/6054.bugfix
@@ -0,0 +1,4 @@
+Fix ``utils.encoding.auto_decode()`` ``LookupError`` with invalid encodings.
+``utils.encoding.auto_decode()`` was broken when decoding Big Endian BOM
+byte-strings on Little Endian or vice versa.
+
diff --git a/src/pip/_internal/utils/encoding.py b/src/pip/_internal/utils/encoding.py
@@ -9,13 +9,13 @@
     from typing import List, Tuple, Text
 
 BOMS = [
-    (codecs.BOM_UTF8, 'utf8'),
-    (codecs.BOM_UTF16, 'utf16'),
-    (codecs.BOM_UTF16_BE, 'utf16-be'),
-    (codecs.BOM_UTF16_LE, 'utf16-le'),
-    (codecs.BOM_UTF32, 'utf32'),
-    (codecs.BOM_UTF32_BE, 'utf32-be'),
-    (codecs.BOM_UTF32_LE, 'utf32-le'),
+    (codecs.BOM_UTF8, 'utf-8'),
+    (codecs.BOM_UTF16, 'utf-16'),
+    (codecs.BOM_UTF16_BE, 'utf-16-be'),
+    (codecs.BOM_UTF16_LE, 'utf-16-le'),
+    (codecs.BOM_UTF32, 'utf-32'),
+    (codecs.BOM_UTF32_BE, 'utf-32-be'),
+    (codecs.BOM_UTF32_LE, 'utf-32-le'),
 ]  # type: List[Tuple[bytes, Text]]
 
 ENCODING_RE = re.compile(br'coding[:=]\s*([-\w.]+)')

diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
@@ -4,6 +4,7 @@
 util tests
 
 """
+import codecs
 import itertools
 import os
 import shutil
@@ -20,7 +21,7 @@
 from pip._internal.exceptions import (
     HashMismatch, HashMissing, InstallationError, UnsupportedPythonVersion,
 )
-from pip._internal.utils.encoding import auto_decode
+from pip._internal.utils.encoding import BOMS, auto_decode
 from pip._internal.utils.glibc import check_glibc_version
 from pip._internal.utils.hashes import Hashes, MissingHashes
 from pip._internal.utils.misc import (
@@ -462,11 +463,20 @@ def test_non_zero(self):
 class TestEncoding(object):
     """Tests for pip._internal.utils.encoding"""
 
-    def test_auto_decode_utf16_le(self):
+    def test_auto_decode_utf_16_le(self):
         data = (
             b'\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00'
             b'=\x001\x00.\x004\x00.\x002\x00'
         )
+        assert data.startswith(codecs.BOM_UTF16_LE)
+        assert auto_decode(data) == "Django==1.4.2"
+
+    def test_auto_decode_utf_16_be(self):
+        data = (
+            b'\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00='
+            b'\x00=\x001\x00.\x004\x00.\x002'
+        )
+        assert data.startswith(codecs.BOM_UTF16_BE)
         assert auto_decode(data) == "Django==1.4.2"
 
     def test_auto_decode_no_bom(self):
@@ -486,6 +496,11 @@ def test_auto_decode_no_preferred_encoding(self):
                 ret = auto_decode(data.encode(sys.getdefaultencoding()))
         assert ret == data
 
+    @pytest.mark.parametrize('encoding', [encoding for bom, encoding in BOMS])
+    def test_all_encodings_are_valid(self, encoding):
+        # we really only care that there is no LookupError
+        assert ''.encode(encoding).decode(encoding) == ''
+
 
 class TestTempDirectory(object):