From 3fe8eb2838b8a870bea67aca009b4e1134c50859 Mon Sep 17 00:00:00 2001 From: Marat Sharafutdinov Date: Thu, 23 Nov 2017 15:52:31 +0300 Subject: [PATCH 1/6] Add the processing of invalid charsets while detecting content encoding --- aiohttp/client_reqrep.py | 6 ++++++ tests/test_client_response.py | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/aiohttp/client_reqrep.py b/aiohttp/client_reqrep.py index 35695de98cc..00890c4bb7b 100644 --- a/aiohttp/client_reqrep.py +++ b/aiohttp/client_reqrep.py @@ -1,4 +1,5 @@ import asyncio +import codecs import collections import io import json @@ -761,6 +762,11 @@ def _get_encoding(self): mimetype = helpers.parse_mimetype(ctype) encoding = mimetype.parameters.get('charset') + if encoding: + try: + codecs.lookup(encoding) + except LookupError: + encoding = None if not encoding: if mimetype.type == 'application' and mimetype.subtype == 'json': # RFC 7159 states that the default encoding is UTF-8. diff --git a/tests/test_client_response.py b/tests/test_client_response.py index 4765265e058..76d12c9da27 100644 --- a/tests/test_client_response.py +++ b/tests/test_client_response.py @@ -283,6 +283,26 @@ def side_effect(*args, **kwargs): assert response._connection is None +async def test_text_detect_encoding_if_invalid_charset(loop, session): + response = ClientResponse('get', URL('http://def-cl-resp.org')) + response._post_init(loop, session) + + def side_effect(*args, **kwargs): + fut = loop.create_future() + fut.set_result('{"тест": "пройден"}'.encode('cp1251')) + return fut + + response.headers = {'Content-Type': 'text/plain;charset=invalid'} + content = response.content = mock.Mock() + content.read.side_effect = side_effect + + await response.read() + res = await response.text() + assert res == '{"тест": "пройден"}' + assert response._connection is None + assert response._get_encoding() == 'windows-1251' + + async def test_text_after_read(loop, session): response = ClientResponse('get', 
URL('http://def-cl-resp.org')) response._post_init(loop, session) From 14d2e4aefe1fe6c04d5b23d7de3d59d7bff7ef9d Mon Sep 17 00:00:00 2001 From: Marat Sharafutdinov Date: Thu, 23 Nov 2017 15:53:26 +0300 Subject: [PATCH 2/6] Make the `aiohttp.ClientResponse.get_encoding` method public --- aiohttp/client_reqrep.py | 6 +++--- tests/test_client_response.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/aiohttp/client_reqrep.py b/aiohttp/client_reqrep.py index 00890c4bb7b..d6429d6531e 100644 --- a/aiohttp/client_reqrep.py +++ b/aiohttp/client_reqrep.py @@ -757,7 +757,7 @@ async def read(self): return self._content - def _get_encoding(self): + def get_encoding(self): ctype = self.headers.get(hdrs.CONTENT_TYPE, '').lower() mimetype = helpers.parse_mimetype(ctype) @@ -784,7 +784,7 @@ async def text(self, encoding=None, errors='strict'): await self.read() if encoding is None: - encoding = self._get_encoding() + encoding = self.get_encoding() return self._content.decode(encoding, errors=errors) @@ -809,7 +809,7 @@ async def json(self, *, encoding=None, loads=json.loads, return None if encoding is None: - encoding = self._get_encoding() + encoding = self.get_encoding() return loads(stripped.decode(encoding)) diff --git a/tests/test_client_response.py b/tests/test_client_response.py index 76d12c9da27..833bf0aacbf 100644 --- a/tests/test_client_response.py +++ b/tests/test_client_response.py @@ -256,12 +256,12 @@ def side_effect(*args, **kwargs): 'Content-Type': 'application/json'} content = response.content = mock.Mock() content.read.side_effect = side_effect - response._get_encoding = mock.Mock() + response.get_encoding = mock.Mock() res = await response.text(encoding='cp1251') assert res == '{"тест": "пройден"}' assert response._connection is None - assert not response._get_encoding.called + assert not response.get_encoding.called async def test_text_detect_encoding(loop, session): @@ -300,7 +300,7 @@ def side_effect(*args, **kwargs): res = 
await response.text() assert res == '{"тест": "пройден"}' assert response._connection is None - assert response._get_encoding() == 'windows-1251' + assert response.get_encoding() == 'windows-1251' async def test_text_after_read(loop, session): @@ -392,12 +392,12 @@ def side_effect(*args, **kwargs): 'Content-Type': 'application/json;charset=utf8'} content = response.content = mock.Mock() content.read.side_effect = side_effect - response._get_encoding = mock.Mock() + response.get_encoding = mock.Mock() res = await response.json(encoding='cp1251') assert res == {'тест': 'пройден'} assert response._connection is None - assert not response._get_encoding.called + assert not response.get_encoding.called @pytest.mark.xfail @@ -418,7 +418,7 @@ def test_get_encoding_unknown(loop, session): response.headers = {'Content-Type': 'application/json'} with mock.patch('aiohttp.client_reqrep.chardet') as m_chardet: m_chardet.detect.return_value = {'encoding': None} - assert response._get_encoding() == 'utf-8' + assert response.get_encoding() == 'utf-8' def test_raise_for_status_2xx(): From fcaa3036d3c3d0720a4f84a0e4556f7e646bfb70 Mon Sep 17 00:00:00 2001 From: Marat Sharafutdinov Date: Thu, 23 Nov 2017 15:57:36 +0300 Subject: [PATCH 3/6] Add docs --- CHANGES/2549.feature | 2 ++ CONTRIBUTORS.txt | 1 + docs/client_reference.rst | 6 ++++++ 3 files changed, 9 insertions(+) create mode 100644 CHANGES/2549.feature diff --git a/CHANGES/2549.feature b/CHANGES/2549.feature new file mode 100644 index 00000000000..1d3e60a47f7 --- /dev/null +++ b/CHANGES/2549.feature @@ -0,0 +1,2 @@ +Make the `aiohttp.ClientResponse.get_encoding` method public with +the processing of invalid charsets while detecting content encoding. 
diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt index 6a50505ef96..51f79744bc4 100644 --- a/CONTRIBUTORS.txt +++ b/CONTRIBUTORS.txt @@ -122,6 +122,7 @@ Ludovic Gasc Lukasz Marcin Dobrzanski Makc Belousow Manuel Miranda +Marat Sharafutdinov Marco Paolini Mariano Anaya Martin Melka diff --git a/docs/client_reference.rst b/docs/client_reference.rst index a91c9241424..54334577f7b 100644 --- a/docs/client_reference.rst +++ b/docs/client_reference.rst @@ -1162,6 +1162,12 @@ Response object A namedtuple with request URL and headers from :class:`ClientRequest` object, :class:`aiohttp.RequestInfo` instance. + .. method:: get_encoding() + + Automatically detect content encoding using ``charset`` info in + ``Content-Type`` HTTP header. If this info does not exist or there + is no appropriate codec for the encoding, then :term:`cchardet` / + :term:`chardet` is used. ClientWebSocketResponse From 90a33a235d93ffaf4e15d6a888e2d595602ce49b Mon Sep 17 00:00:00 2001 From: Marat Sharafutdinov Date: Thu, 23 Nov 2017 16:11:26 +0300 Subject: [PATCH 4/6] Fix tests --- tests/test_client_response.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_client_response.py b/tests/test_client_response.py index 833bf0aacbf..602691cc5ab 100644 --- a/tests/test_client_response.py +++ b/tests/test_client_response.py @@ -300,7 +300,7 @@ def side_effect(*args, **kwargs): res = await response.text() assert res == '{"тест": "пройден"}' assert response._connection is None - assert response.get_encoding() == 'windows-1251' + assert response.get_encoding().lower() == 'windows-1251' async def test_text_after_read(loop, session): From 994b4f6cfc36d12bde67de415c87ca22fb7a7028 Mon Sep 17 00:00:00 2001 From: Marat Sharafutdinov Date: Thu, 23 Nov 2017 16:30:11 +0300 Subject: [PATCH 5/6] Fix change description --- CHANGES/2549.feature | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES/2549.feature b/CHANGES/2549.feature index 1d3e60a47f7..680d8a1e63b 100644 --- 
a/CHANGES/2549.feature +++ b/CHANGES/2549.feature @@ -1,2 +1,2 @@ Make the `aiohttp.ClientResponse.get_encoding` method public with -the processing of invalid charsets while detecting content encoding. +the processing of invalid charset while detecting content encoding. From 325209c0bfefd56fd00ee4ab878c5b1d0c361e2a Mon Sep 17 00:00:00 2001 From: Andrew Svetlov Date: Thu, 23 Nov 2017 19:00:55 +0200 Subject: [PATCH 6/6] Update client_reference.rst --- docs/client_reference.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/client_reference.rst b/docs/client_reference.rst index 54334577f7b..572a3567229 100644 --- a/docs/client_reference.rst +++ b/docs/client_reference.rst @@ -1168,6 +1168,8 @@ Response object ``Content-Type`` HTTP header. If this info does not exist or there is no appropriate codec for the encoding, then :term:`cchardet` / :term:`chardet` is used. + + .. versionadded:: 3.0 ClientWebSocketResponse