Skip to content

Commit

Permalink
Fix utils.encoding.auto_decode() LookupError with invalid encodings
Browse files Browse the repository at this point in the history
utils.encoding.auto_decode() was broken when decoding byte-strings with a
Big Endian BOM on Little Endian systems, or vice versa.

The TestEncoding.test_auto_decode_utf_16_le test was failing on Big Endian
systems, such as Fedora's s390x builders. A similar test with a BE BOM,
test_auto_decode_utf_16_be, was added in order to reproduce this on a Little
Endian system (which is much easier to come by).

A regression test was added to check that all listed encodings in
utils.encoding.BOMS are valid.

Fixes #6054
  • Loading branch information
hroncok committed Mar 1, 2019
1 parent 729404d commit d48475d
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 9 deletions.
4 changes: 4 additions & 0 deletions news/6054.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix ``utils.encoding.auto_decode()`` ``LookupError`` with invalid encodings.
``utils.encoding.auto_decode()`` was broken when decoding Big Endian BOM
byte-strings on Little Endian or vice versa.

14 changes: 7 additions & 7 deletions src/pip/_internal/utils/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,13 @@
from typing import List, Tuple, Text

# Table of (byte-order-mark prefix, codec name) pairs; the element type is
# spelled out in the type comment on the closing bracket.
# NOTE(review): this span is a rendered diff without +/- markers. The hunk
# header reports 7 additions and 7 deletions, so the first seven entries
# (names without a hyphen after "utf", e.g. 'utf16-be') appear to be the
# removed lines and the last seven (e.g. 'utf-16-be') their replacements.
# Confirm against the real file before treating this as a 14-entry list.
BOMS = [
(codecs.BOM_UTF8, 'utf8'),
(codecs.BOM_UTF16, 'utf16'),
(codecs.BOM_UTF16_BE, 'utf16-be'),
(codecs.BOM_UTF16_LE, 'utf16-le'),
(codecs.BOM_UTF32, 'utf32'),
(codecs.BOM_UTF32_BE, 'utf32-be'),
(codecs.BOM_UTF32_LE, 'utf32-le'),
(codecs.BOM_UTF8, 'utf-8'),
(codecs.BOM_UTF16, 'utf-16'),
(codecs.BOM_UTF16_BE, 'utf-16-be'),
(codecs.BOM_UTF16_LE, 'utf-16-le'),
(codecs.BOM_UTF32, 'utf-32'),
(codecs.BOM_UTF32_BE, 'utf-32-be'),
(codecs.BOM_UTF32_LE, 'utf-32-le'),
] # type: List[Tuple[bytes, Text]]

# Bytes pattern: the literal "coding", then ':' or '=', optional whitespace,
# and a captured run of word characters, '-' or '.'.
ENCODING_RE = re.compile(br'coding[:=]\s*([-\w.]+)')
Expand Down
19 changes: 17 additions & 2 deletions tests/unit/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
util tests
"""
import codecs
import itertools
import os
import shutil
Expand All @@ -20,7 +21,7 @@
from pip._internal.exceptions import (
HashMismatch, HashMissing, InstallationError, UnsupportedPythonVersion,
)
from pip._internal.utils.encoding import auto_decode
from pip._internal.utils.encoding import BOMS, auto_decode
from pip._internal.utils.glibc import check_glibc_version
from pip._internal.utils.hashes import Hashes, MissingHashes
from pip._internal.utils.misc import (
Expand Down Expand Up @@ -462,11 +463,20 @@ def test_non_zero(self):
class TestEncoding(object):
"""Tests for pip._internal.utils.encoding"""

def test_auto_decode_utf16_le(self):
def test_auto_decode_utf_16_le(self):
    """auto_decode() handles bytes that begin with the UTF-16 LE BOM."""
    raw = (
        b'\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00'
        b'=\x001\x00.\x004\x00.\x002\x00'
    )
    # Guard the fixture itself: it must really start with the LE BOM.
    assert raw.startswith(codecs.BOM_UTF16_LE)
    decoded = auto_decode(raw)
    assert decoded == "Django==1.4.2"

def test_auto_decode_utf_16_be(self):
    """auto_decode() handles bytes that begin with the UTF-16 BE BOM."""
    raw = (
        b'\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00='
        b'\x00=\x001\x00.\x004\x00.\x002'
    )
    # Guard the fixture itself: it must really start with the BE BOM.
    assert raw.startswith(codecs.BOM_UTF16_BE)
    decoded = auto_decode(raw)
    assert decoded == "Django==1.4.2"

def test_auto_decode_no_bom(self):
Expand All @@ -486,6 +496,11 @@ def test_auto_decode_no_preferred_encoding(self):
ret = auto_decode(data.encode(sys.getdefaultencoding()))
assert ret == data

@pytest.mark.parametrize('encoding', [codec_name for _bom, codec_name in BOMS])
def test_all_encodings_are_valid(self, encoding):
    """Round-trip an empty string through each codec name listed in BOMS."""
    # The comparison is incidental; the point is that looking up the
    # codec by name does not raise LookupError.
    empty = ''
    round_tripped = empty.encode(encoding).decode(encoding)
    assert round_tripped == ''


class TestTempDirectory(object):

Expand Down

0 comments on commit d48475d

Please sign in to comment.