From 3eb85a0619371ca9a20a131ae9394d40eb949b5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Gia=20Phong?= Date: Thu, 18 Jun 2020 23:10:00 +0700 Subject: [PATCH 1/3] Draft lazy zip over HTTP --- ...727978-e22a-427d-aa03-11ce55d8f6f9.trivial | 0 src/pip/_internal/network/download.py | 27 +-- src/pip/_internal/network/lazy_wheel.py | 204 ++++++++++++++++++ src/pip/_internal/network/utils.py | 23 +- 4 files changed, 228 insertions(+), 26 deletions(-) create mode 100644 news/70727978-e22a-427d-aa03-11ce55d8f6f9.trivial create mode 100644 src/pip/_internal/network/lazy_wheel.py diff --git a/news/70727978-e22a-427d-aa03-11ce55d8f6f9.trivial b/news/70727978-e22a-427d-aa03-11ce55d8f6f9.trivial new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py index 2f3e08ae62e..7110c8ebdbf 100644 --- a/src/pip/_internal/network/download.py +++ b/src/pip/_internal/network/download.py @@ -11,7 +11,7 @@ from pip._internal.cli.progress_bars import DownloadProgressProvider from pip._internal.models.index import PyPI from pip._internal.network.cache import is_from_cache -from pip._internal.network.utils import response_chunks +from pip._internal.network.utils import HEADERS, response_chunks from pip._internal.utils.misc import ( format_size, redact_auth_from_url, @@ -132,30 +132,7 @@ def _get_http_response_filename(resp, link): def _http_get_download(session, link): # type: (PipSession, Link) -> Response target_url = link.url.split('#', 1)[0] - resp = session.get( - target_url, - # We use Accept-Encoding: identity here because requests - # defaults to accepting compressed responses. This breaks in - # a variety of ways depending on how the server is configured. 
- # - Some servers will notice that the file isn't a compressible - # file and will leave the file alone and with an empty - # Content-Encoding - # - Some servers will notice that the file is already - # compressed and will leave the file alone and will add a - # Content-Encoding: gzip header - # - Some servers won't notice anything at all and will take - # a file that's already been compressed and compress it again - # and set the Content-Encoding: gzip header - # By setting this to request only the identity encoding We're - # hoping to eliminate the third case. Hopefully there does not - # exist a server which when given a file will notice it is - # already compressed and that you're not asking for a - # compressed file and will then decompress it before sending - # because if that's the case I don't think it'll ever be - # possible to make this work. - headers={"Accept-Encoding": "identity"}, - stream=True, - ) + resp = session.get(target_url, headers=HEADERS, stream=True) resp.raise_for_status() return resp diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py new file mode 100644 index 00000000000..68ad5afcc60 --- /dev/null +++ b/src/pip/_internal/network/lazy_wheel.py @@ -0,0 +1,204 @@ +"""Lazy ZIP over HTTP""" + +__all__ = ['LazyZip'] + +from bisect import bisect_left, bisect_right +from contextlib import contextmanager +from tempfile import NamedTemporaryFile +from zipfile import BadZipfile, ZipFile + +from pip._vendor.requests.models import CONTENT_CHUNK_SIZE +from pip._vendor.six.moves import range + +from pip._internal.network.utils import HEADERS, response_chunks +from pip._internal.utils.typing import MYPY_CHECK_RUNNING + +if MYPY_CHECK_RUNNING: + from typing import Any, Dict, Iterator, List, Optional, Tuple + + from pip._vendor.requests.models import Response + + from pip._internal.network.session import PipSession + + +class LazyZip: + """File-like object mapped to a ZIP file over HTTP. 
+ + This uses HTTP range requests to lazily fetch the file's content, + which is supposed to be fed to ZipFile. + """ + + def __init__(self, session, url, chunk_size=CONTENT_CHUNK_SIZE): + # type: (PipSession, str, int) -> None + head = session.head(url, headers=HEADERS) + head.raise_for_status() + assert head.status_code == 200 + self._session, self._url, self._chunk_size = session, url, chunk_size + self._length = int(head.headers['Content-Length']) + self._file = NamedTemporaryFile() + self.truncate(self._length) + self._left = [] # type: List[int] + self._right = [] # type: List[int] + self._check_zip('bytes' in head.headers.get('Accept-Ranges', 'none')) + + @property + def mode(self): + # type: () -> str + """Opening mode, which is always rb.""" + return 'rb' + + @property + def name(self): + # type: () -> str + """File name.""" + return self._file.name + + def seekable(self): + # type: () -> bool + """Return whether random access is supported, which is True.""" + return True + + def close(self): + # type: () -> None + """Close the file.""" + self._file.close() + + @property + def closed(self): + # type: () -> bool + """Whether the file is closed.""" + return self._file.closed + + def read(self, size=-1): + # type: (int) -> bytes + """Read up to size bytes from the object and return them. + + As a convenience, if size is unspecified or -1, + all bytes until EOF are returned. Fewer than + size bytes may be returned if EOF is reached. + """ + start, length = self.tell(), self._length + stop = start + size if 0 <= size <= length-start else length + self._download(start, stop-1) + return self._file.read(size) + + def readable(self): + # type: () -> bool + """Return whether the file is readable, which is True.""" + return True + + def seek(self, offset, whence=0): + # type: (int, int) -> int + """Change stream position and return the new absolute position. + + Seek to offset relative position indicated by whence: + * 0: Start of stream (the default). 
pos should be >= 0; + * 1: Current position - pos may be negative; + * 2: End of stream - pos usually negative. + """ + return self._file.seek(offset, whence) + + def tell(self): + # type: () -> int + """Return the current position.""" + return self._file.tell() + + def truncate(self, size=None): + # type: (Optional[int]) -> int + """Resize the stream to the given size in bytes. + + If size is unspecified resize to the current position. + The current stream position isn't changed. + + Return the new file size. + """ + return self._file.truncate(size) + + def writable(self): + # type: () -> bool + """Return False.""" + return False + + def __enter__(self): + # type: () -> LazyZip + self._file.__enter__() + return self + + def __exit__(self, *exc): + # type: (*Any) -> Optional[bool] + return self._file.__exit__(*exc) + + @contextmanager + def _stay(self): + # type: ()-> Iterator[None] + """Return a context manager keeping the position. + + At the end of the block, seek back to original position. + """ + pos = self.tell() + try: + yield + finally: + self.seek(pos) + + def _check_zip(self, range_request): + # type: (bool) -> None + """Check and download until the file is a valid ZIP.""" + end = self._length - 1 + if not range_request: + self._download(0, end) + return + for start in reversed(range(0, end, self._chunk_size)): + self._download(start, end) + with self._stay(): + try: + # For read-only ZIP files, ZipFile only needs + # methods read, seek, seekable and tell. + # The best way to type-hint in this case is to use + # Python 3.8+ typing.Protocol. 
+ ZipFile(self) # type: ignore + except BadZipfile: + pass + else: + break + + def _stream_response(self, start, end, base_headers=HEADERS): + # type: (int, int, Dict[str, str]) -> Response + """Return HTTP response to a range request from start to end.""" + headers = {'Range': 'bytes={}-{}'.format(start, end)} + headers.update(base_headers) + return self._session.get(self._url, headers=headers, stream=True) + + def _merge(self, start, end, left, right): + # type: (int, int, int, int) -> Iterator[Tuple[int, int]] + """Return an iterator of intervals to be fetched. + + Args: + start (int): Start of needed interval + end (int): End of needed interval + left (int): Index of first overlapping downloaded data + right (int): Index after last overlapping downloaded data + """ + lslice, rslice = self._left[left:right], self._right[left:right] + i = start = min([start]+lslice[:1]) + end = max([end]+rslice[-1:]) + for j, k in zip(lslice, rslice): + if j > i: + yield i, j-1 + i = k + 1 + if i <= end: + yield i, end + self._left[left:right], self._right[left:right] = [start], [end] + + def _download(self, start, end): + # type: (int, int) -> None + """Download bytes from start to end inclusively.""" + with self._stay(): + left = bisect_left(self._right, start) + right = bisect_right(self._left, end) + for start, end in self._merge(start, end, left, right): + response = self._stream_response(start, end) + response.raise_for_status() + self.seek(start) + for chunk in response_chunks(response, self._chunk_size): + self._file.write(chunk) diff --git a/src/pip/_internal/network/utils.py b/src/pip/_internal/network/utils.py index a19050b0f70..412a3ca1632 100644 --- a/src/pip/_internal/network/utils.py +++ b/src/pip/_internal/network/utils.py @@ -3,7 +3,28 @@ from pip._internal.utils.typing import MYPY_CHECK_RUNNING if MYPY_CHECK_RUNNING: - from typing import Iterator + from typing import Dict, Iterator + +# The following comments and HTTP headers were originally added by +# Donald 
Stufft in git commit 22c562429a61bb77172039e480873fb239dd8c03. +# +# We use Accept-Encoding: identity here because requests defaults to +# accepting compressed responses. This breaks in a variety of ways +# depending on how the server is configured. +# - Some servers will notice that the file isn't a compressible file +# and will leave the file alone and with an empty Content-Encoding +# - Some servers will notice that the file is already compressed and +# will leave the file alone, adding a Content-Encoding: gzip header +# - Some servers won't notice anything at all and will take a file +# that's already been compressed and compress it again, and set +# the Content-Encoding: gzip header +# By setting this to request only the identity encoding we're hoping +# to eliminate the third case. Hopefully there does not exist a server +# which when given a file will notice it is already compressed and that +# you're not asking for a compressed file and will then decompress it +# before sending because if that's the case I don't think it'll ever be +# possible to make this work. 
+HEADERS = {'Accept-Encoding': 'identity'} # type: Dict[str, str] def response_chunks(response, chunk_size=CONTENT_CHUNK_SIZE): From e1438d06b522c0f6ce9bb403ddd701bc82382df7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Gia=20Phong?= Date: Wed, 24 Jun 2020 16:30:48 +0700 Subject: [PATCH 2/3] Rename and wrap LazyZipOverHTTP --- src/pip/_internal/network/lazy_wheel.py | 47 +++++++++++++++++-------- 1 file changed, 32 insertions(+), 15 deletions(-) diff --git a/src/pip/_internal/network/lazy_wheel.py b/src/pip/_internal/network/lazy_wheel.py index 68ad5afcc60..d7b8bcc21ac 100644 --- a/src/pip/_internal/network/lazy_wheel.py +++ b/src/pip/_internal/network/lazy_wheel.py @@ -1,6 +1,6 @@ """Lazy ZIP over HTTP""" -__all__ = ['LazyZip'] +__all__ = ['dist_from_wheel_url'] from bisect import bisect_left, bisect_right from contextlib import contextmanager @@ -12,24 +12,44 @@ from pip._internal.network.utils import HEADERS, response_chunks from pip._internal.utils.typing import MYPY_CHECK_RUNNING +from pip._internal.utils.wheel import pkg_resources_distribution_for_wheel if MYPY_CHECK_RUNNING: from typing import Any, Dict, Iterator, List, Optional, Tuple + from pip._vendor.pkg_resources import Distribution from pip._vendor.requests.models import Response from pip._internal.network.session import PipSession -class LazyZip: +def dist_from_wheel_url(name, url, session): + # type: (str, str, PipSession) -> Distribution + """Return a pkg_resources.Distribution from the given wheel URL. + + This uses HTTP range requests to only fetch the portion of the wheel + containing metadata, just enough for the object to be constructed. + If such requests are not supported, RuntimeError is raised. + """ + with LazyZipOverHTTP(url, session) as wheel: + # For read-only ZIP files, ZipFile only needs methods read, + # seek, seekable and tell, not the whole IO protocol. 
+ zip_file = ZipFile(wheel) # type: ignore + # After context manager exit, wheel.name + # is an invalid file by intention. + return pkg_resources_distribution_for_wheel(zip_file, name, wheel.name) + + +class LazyZipOverHTTP(object): """File-like object mapped to a ZIP file over HTTP. This uses HTTP range requests to lazily fetch the file's content, - which is supposed to be fed to ZipFile. + which is supposed to be fed to ZipFile. If such requests are not + supported by the server, raise RuntimeError during initialization. """ - def __init__(self, session, url, chunk_size=CONTENT_CHUNK_SIZE): - # type: (PipSession, str, int) -> None + def __init__(self, url, session, chunk_size=CONTENT_CHUNK_SIZE): + # type: (str, PipSession, int) -> None head = session.head(url, headers=HEADERS) head.raise_for_status() assert head.status_code == 200 @@ -39,7 +59,9 @@ def __init__(self, session, url, chunk_size=CONTENT_CHUNK_SIZE): self.truncate(self._length) self._left = [] # type: List[int] self._right = [] # type: List[int] - self._check_zip('bytes' in head.headers.get('Accept-Ranges', 'none')) + if 'bytes' not in head.headers.get('Accept-Ranges', 'none'): + raise RuntimeError('range request is not supported') + self._check_zip() @property def mode(self): @@ -50,7 +72,7 @@ def mode(self): @property def name(self): # type: () -> str - """File name.""" + """Path to the underlying file.""" return self._file.name def seekable(self): @@ -120,7 +142,7 @@ def writable(self): return False def __enter__(self): - # type: () -> LazyZip + # type: () -> LazyZipOverHTTP self._file.__enter__() return self @@ -141,21 +163,16 @@ def _stay(self): finally: self.seek(pos) - def _check_zip(self, range_request): - # type: (bool) -> None + def _check_zip(self): + # type: () -> None """Check and download until the file is a valid ZIP.""" end = self._length - 1 - if not range_request: - self._download(0, end) - return for start in reversed(range(0, end, self._chunk_size)): self._download(start, end) 
with self._stay(): try: # For read-only ZIP files, ZipFile only needs # methods read, seek, seekable and tell. - # The best way to type-hint in this case is to use - # Python 3.8+ typing.Protocol. ZipFile(self) # type: ignore except BadZipfile: pass From 25a25a0975841ce319790d8047999876ecb1188b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Gia=20Phong?= Date: Fri, 26 Jun 2020 15:15:18 +0700 Subject: [PATCH 3/3] Test network.lazy_wheel.dist_from_wheel_url --- tests/lib/requests_mocks.py | 2 +- tests/unit/test_network_lazy_wheel.py | 50 +++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_network_lazy_wheel.py diff --git a/tests/lib/requests_mocks.py b/tests/lib/requests_mocks.py index baaf77ecc25..41c30eafd9c 100644 --- a/tests/lib/requests_mocks.py +++ b/tests/lib/requests_mocks.py @@ -28,7 +28,7 @@ def __init__(self, contents): self.status_code = 200 self.connection = None self.url = None - self.headers = {} + self.headers = {'Content-Length': len(contents)} self.history = [] def raise_for_status(self): diff --git a/tests/unit/test_network_lazy_wheel.py b/tests/unit/test_network_lazy_wheel.py new file mode 100644 index 00000000000..694d126859f --- /dev/null +++ b/tests/unit/test_network_lazy_wheel.py @@ -0,0 +1,50 @@ +from zipfile import BadZipfile + +from pip._vendor.pkg_resources import Requirement +from pytest import fixture, mark, raises + +from pip._internal.network.lazy_wheel import dist_from_wheel_url +from pip._internal.network.session import PipSession +from tests.lib.requests_mocks import MockResponse + +MYPY_0_782_WHL = ( + 'https://files.pythonhosted.org/packages/9d/65/' + 'b96e844150ce18b9892b155b780248955ded13a2581d31872e7daa90a503/' + 'mypy-0.782-py3-none-any.whl' +) +MYPY_0_782_REQS = { + Requirement('typed-ast (<1.5.0,>=1.4.0)'), + Requirement('typing-extensions (>=3.7.4)'), + Requirement('mypy-extensions (<0.5.0,>=0.4.3)'), + Requirement('psutil (>=4.0); extra == "dmypy"'), +} 
+ + +@fixture +def session(): + return PipSession() + + +@mark.network +def test_dist_from_wheel_url(session): + """Test if the acquired distribution contains correct information.""" + dist = dist_from_wheel_url('mypy', MYPY_0_782_WHL, session) + assert dist.key == 'mypy' + assert dist.version == '0.782' + assert dist.extras == ['dmypy'] + assert set(dist.requires(dist.extras)) == MYPY_0_782_REQS + + +@mark.network +def test_dist_from_wheel_url_no_range(session, monkeypatch): + """Test handling when HTTP range requests are not supported.""" + monkeypatch.setattr(session, 'head', lambda *a, **kw: MockResponse(b'')) + with raises(RuntimeError): + dist_from_wheel_url('mypy', MYPY_0_782_WHL, session) + + +@mark.network +def test_dist_from_wheel_url_not_zip(session): + """Test handling when the given URL does not point to a ZIP.""" + with raises(BadZipfile): + dist_from_wheel_url('python', 'https://www.python.org/', session)