From f474a8a2d5a0bba0d95981a66c471359af55c145 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Randy=20D=C3=B6ring?= <30527984+radoering@users.noreply.github.com> Date: Sat, 16 Dec 2023 12:08:14 +0100 Subject: [PATCH] feature: add new "lazy-wheel" config option If active and a server supports range requests, we do not have to download entire wheels to fetch metadata but can just download the METADATA file from the remote wheel with a couple of range requests. --- docs/configuration.md | 17 + src/poetry/config/config.py | 4 + src/poetry/console/commands/config.py | 3 +- src/poetry/inspection/info.py | 22 +- src/poetry/inspection/lazy_wheel.py | 762 ++++++++++++++++++ src/poetry/repositories/http_repository.py | 70 +- src/poetry/utils/authenticator.py | 4 + src/poetry/utils/helpers.py | 13 + tests/console/commands/test_config.py | 25 + ...ssing_dist_info-0.1.0-py2.py3-none-any.whl | Bin 0 -> 310 bytes tests/helpers.py | 10 +- tests/inspection/conftest.py | 146 ++++ tests/inspection/test_info.py | 24 + tests/inspection/test_lazy_wheel.py | 188 +++++ tests/repositories/test_http_repository.py | 144 ++++ tests/repositories/test_legacy_repository.py | 5 +- tests/repositories/test_pypi_repository.py | 5 +- .../test_single_page_repository.py | 5 +- tests/utils/test_helpers.py | 35 + 19 files changed, 1464 insertions(+), 18 deletions(-) create mode 100644 src/poetry/inspection/lazy_wheel.py create mode 100644 tests/fixtures/distributions/demo_missing_dist_info-0.1.0-py2.py3-none-any.whl create mode 100644 tests/inspection/conftest.py create mode 100644 tests/inspection/test_lazy_wheel.py create mode 100644 tests/repositories/test_http_repository.py diff --git a/docs/configuration.md b/docs/configuration.md index 384140b4162..67a0301c3cd 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -271,6 +271,23 @@ across all your projects if incorrectly set. Use parallel execution when using the new (`>=1.1.0`) installer. +### `solver.lazy-wheel` + +**Type**: `boolean` + +**Default**: `true` + +**Environment Variable**: `POETRY_SOLVER_LAZY_WHEEL` + +*Introduced in 1.8.0* + +Do not download entire wheels to extract metadata but use +[HTTP range requests](https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests) +to only download the METADATA files of wheels. +Especially with slow network connections this setting can speed up dependency resolution significantly. +If the cache has already been filled or the server does not support HTTP range requests, +this setting makes no difference. 
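As a usage note, the option can be toggled per project with `poetry config --local solver.lazy-wheel false` (or globally without `--local`), which is also how the new tests further down exercise it.

To make the mechanism concrete, here is a minimal standalone sketch of the idea, not the patch's implementation (that lives in `src/poetry/inspection/lazy_wheel.py` below); the URL is a placeholder and error handling is omitted. It probes a server for byte-range support and then fetches only the last 10,000 bytes of a wheel, the same initial chunk size the patch uses, since the ZIP central directory sits at the end of the archive and `*.dist-info/METADATA` is usually close to it:

```python
import requests

url = "https://example.org/wheels/some_pkg-1.0-py3-none-any.whl"  # placeholder

# 1. Does the server advertise support for byte ranges at all?
head = requests.head(url, headers={"Accept-Encoding": "identity"})
if head.headers.get("Accept-Ranges") != "bytes":
    raise RuntimeError("no range support; the whole wheel has to be downloaded")

# 2. Fetch only the tail of the wheel instead of the entire file.  A 206
#    (Partial Content) response means only ~10 kB went over the wire; the
#    lazy-wheel code then reads METADATA out of these partial ZIP contents.
tail = requests.get(url, headers={"Range": "bytes=-10000"})
if tail.status_code == 206:
    print(len(tail.content), "bytes of", tail.headers.get("Content-Range"))
```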
+ ### `virtualenvs.create` **Type**: `boolean` diff --git a/src/poetry/config/config.py b/src/poetry/config/config.py index 9f0ecaec792..5bafb90218b 100644 --- a/src/poetry/config/config.py +++ b/src/poetry/config/config.py @@ -131,6 +131,9 @@ class Config: "max-workers": None, "no-binary": None, }, + "solver": { + "lazy-wheel": True, + }, "warnings": { "export": True, }, @@ -296,6 +299,7 @@ def _get_normalizer(name: str) -> Callable[[str], Any]: "experimental.system-git-client", "installer.modern-installation", "installer.parallel", + "solver.lazy-wheel", "warnings.export", }: return boolean_normalizer diff --git a/src/poetry/console/commands/config.py b/src/poetry/console/commands/config.py index 3b460fb7aa2..ca5880a01e2 100644 --- a/src/poetry/console/commands/config.py +++ b/src/poetry/console/commands/config.py @@ -68,15 +68,16 @@ def unique_config_values(self) -> dict[str, tuple[Any, Any]]: ), "virtualenvs.path": (str, lambda val: str(Path(val))), "virtualenvs.prefer-active-python": (boolean_validator, boolean_normalizer), + "virtualenvs.prompt": (str, lambda val: str(val)), "experimental.system-git-client": (boolean_validator, boolean_normalizer), "installer.modern-installation": (boolean_validator, boolean_normalizer), "installer.parallel": (boolean_validator, boolean_normalizer), "installer.max-workers": (lambda val: int(val) > 0, int_normalizer), - "virtualenvs.prompt": (str, lambda val: str(val)), "installer.no-binary": ( PackageFilterPolicy.validator, PackageFilterPolicy.normalize, ), + "solver.lazy-wheel": (boolean_validator, boolean_normalizer), "warnings.export": (boolean_validator, boolean_normalizer), } diff --git a/src/poetry/inspection/info.py b/src/poetry/inspection/info.py index 3edb225c8b5..38c1b40ecab 100644 --- a/src/poetry/inspection/info.py +++ b/src/poetry/inspection/info.py @@ -32,6 +32,8 @@ from poetry.core.packages.project_package import ProjectPackage + from poetry.inspection.lazy_wheel import MemoryWheel + logger = logging.getLogger(__name__) @@ -231,9 +233,7 @@ def to_package( return package @classmethod - def _from_distribution( - cls, dist: pkginfo.BDist | pkginfo.SDist | pkginfo.Wheel - ) -> PackageInfo: + def _from_distribution(cls, dist: pkginfo.Distribution) -> PackageInfo: """ Helper method to parse package information from a `pkginfo.Distribution` instance. @@ -244,7 +244,7 @@ def _from_distribution( if dist.requires_dist: requirements = list(dist.requires_dist) - else: + elif isinstance(dist, (pkginfo.BDist, pkginfo.SDist, pkginfo.Wheel)): requires = Path(dist.filename) / "requires.txt" if requires.exists(): text = requires.read_text(encoding="utf-8") @@ -258,8 +258,9 @@ def _from_distribution( requires_python=dist.requires_python, ) - info._source_type = "file" - info._source_url = Path(dist.filename).resolve().as_posix() + if isinstance(dist, (pkginfo.BDist, pkginfo.SDist, pkginfo.Wheel)): + info._source_type = "file" + info._source_url = Path(dist.filename).resolve().as_posix() return info @@ -522,6 +523,15 @@ def from_wheel(cls, path: Path) -> PackageInfo: except ValueError: return PackageInfo() + @classmethod + def from_memory_wheel(cls, wheel: MemoryWheel) -> PackageInfo: + """ + Gather package information from a partial fetched wheel kept in memory. + + :param path: Path to wheel. 
+ """ + return cls._from_distribution(wheel) + @classmethod def from_bdist(cls, path: Path) -> PackageInfo: """ diff --git a/src/poetry/inspection/lazy_wheel.py b/src/poetry/inspection/lazy_wheel.py new file mode 100644 index 00000000000..7542ed3abe0 --- /dev/null +++ b/src/poetry/inspection/lazy_wheel.py @@ -0,0 +1,762 @@ +"""Lazy ZIP over HTTP""" + +from __future__ import annotations + +import abc +import io +import logging +import re + +from bisect import bisect_left +from bisect import bisect_right +from contextlib import contextmanager +from tempfile import NamedTemporaryFile +from typing import TYPE_CHECKING +from typing import Any +from typing import BinaryIO +from typing import ClassVar +from typing import cast +from urllib.parse import urlparse +from zipfile import BadZipFile +from zipfile import ZipFile + +from pkginfo import Distribution +from requests.models import CONTENT_CHUNK_SIZE +from requests.models import HTTPError +from requests.models import Response +from requests.status_codes import codes + + +if TYPE_CHECKING: + from collections.abc import Iterable + from collections.abc import Iterator + + from requests import Session + + from poetry.utils.authenticator import Authenticator + + +logger = logging.getLogger(__name__) + + +class HTTPRangeRequestUnsupported(Exception): + """Raised when the remote server appears unable to support byte ranges.""" + + +class UnsupportedWheel(Exception): + """Unsupported wheel.""" + + +class InvalidWheel(Exception): + """Invalid (e.g. corrupt) wheel.""" + + def __init__(self, location: str, name: str): + self.location = location + self.name = name + + def __str__(self) -> str: + return f"Wheel '{self.name}' located at {self.location} is invalid." + + +class MemoryWheel(Distribution): + def __init__(self, lazy_file: LazyWheelOverHTTP) -> None: + self.lazy_file = lazy_file + self.extractMetadata() + + def read(self) -> bytes: + with ZipFile(self.lazy_file) as archive: + tuples = [x.split("/") for x in archive.namelist() if "METADATA" in x] + schwarz = sorted([(len(x), x) for x in tuples]) + for path in [x[1] for x in schwarz]: + candidate = "/".join(path) + logger.debug(f"read {candidate}") + data = archive.read(candidate) + if b"Metadata-Version" in data: + return data + else: + raise ValueError(f"No METADATA in archive: {self.lazy_file.name}") + + +def memory_wheel_from_url( + name: str, url: str, session: Session | Authenticator +) -> MemoryWheel: + """Return a MemoryWheel (compatible to pkginfo.Wheel) from the given wheel URL. + + This uses HTTP range requests to only fetch the portion of the wheel + containing metadata, just enough for the object to be constructed. + + :raises HTTPRangeRequestUnsupported: if range requests are unsupported for ``url``. + :raises InvalidWheel: if the zip file contents could not be parsed. + """ + try: + # After context manager exit, wheel.name will point to a deleted file path. + # Add `delete_backing_file=False` to disable this for debugging. + with LazyWheelOverHTTP(url, session) as lazy_file: + # prefetch metadata to reduce the number of range requests + # (we know that METADATA is the only file from the wheel we need) + lazy_file.prefetch_metadata(name) + return MemoryWheel(lazy_file) + + except (BadZipFile, UnsupportedWheel): + # We assume that these errors have occurred because the wheel contents + # themselves are invalid, not because we've messed up our bookkeeping + # and produced an invalid file. 
+ raise InvalidWheel(url, name) + + +class MergeIntervals: + """Stateful bookkeeping to merge interval graphs.""" + + def __init__(self, *, left: Iterable[int] = (), right: Iterable[int] = ()) -> None: + self._left = list(left) + self._right = list(right) + + def __repr__(self) -> str: + return ( + f"{type(self).__name__}" + f"(left={tuple(self._left)}, right={tuple(self._right)})" + ) + + def _merge( + self, start: int, end: int, left: int, right: int + ) -> Iterator[tuple[int, int]]: + """Return an iterator of intervals to be fetched. + + Args: + start (int): Start of needed interval + end (int): End of needed interval + left (int): Index of first overlapping downloaded data + right (int): Index after last overlapping downloaded data + """ + lslice, rslice = self._left[left:right], self._right[left:right] + i = start = min([start] + lslice[:1]) + end = max([end] + rslice[-1:]) + for j, k in zip(lslice, rslice): + if j > i: + yield i, j - 1 + i = k + 1 + if i <= end: + yield i, end + self._left[left:right], self._right[left:right] = [start], [end] + + def minimal_intervals_covering( + self, start: int, end: int + ) -> Iterator[tuple[int, int]]: + """Provide the intervals needed to cover from ``start <= x <= end``. + + This method mutates internal state so that later calls only return intervals not + covered by prior calls. The first call to this method will always return exactly + one interval, which was exactly the one requested. Later requests for + intervals overlapping that first requested interval will yield only the ranges + not previously covered (which may be empty, e.g. if the same interval is + requested twice). + + This may be used e.g. to download substrings of remote files on demand. + """ + left = bisect_left(self._right, start) + right = bisect_right(self._left, end) + yield from self._merge(start, end, left, right) + + +class ReadOnlyIOWrapper(BinaryIO): + """Implement read-side ``BinaryIO`` methods wrapping an inner ``BinaryIO``. + + This wrapper is useful because Python currently does not distinguish read-only + streams at the type level. + """ + + def __init__(self, inner: BinaryIO) -> None: + self._file = inner + + @property + def mode(self) -> str: + """Opening mode, which is always rb.""" + return "rb" + + @property + def name(self) -> str: + """Path to the underlying file.""" + return self._file.name + + def seekable(self) -> bool: + """Return whether random access is supported, which is True.""" + return True + + def close(self) -> None: + """Close the file.""" + self._file.close() + + @property + def closed(self) -> bool: + """Whether the file is closed.""" + return self._file.closed + + def fileno(self) -> int: + return self._file.fileno() + + def flush(self) -> None: + self._file.flush() + + def isatty(self) -> bool: + return False + + def readable(self) -> bool: + """Return whether the file is readable, which is True.""" + return True + + def read(self, size: int = -1) -> bytes: + """Read up to size bytes from the object and return them. + + As a convenience, if size is unspecified or -1, + all bytes until EOF are returned. Fewer than + size bytes may be returned if EOF is reached. + """ + return self._file.read(size) + + def readline(self, limit: int = -1) -> bytes: + # Explicit impl needed to satisfy mypy. + raise NotImplementedError + + def readlines(self, hint: int = -1) -> list[bytes]: + raise NotImplementedError + + def seek(self, offset: int, whence: int = 0) -> int: + """Change stream position and return the new absolute position. 
+ + Seek to offset relative position indicated by whence: + * 0: Start of stream (the default). pos should be >= 0; + * 1: Current position - pos may be negative; + * 2: End of stream - pos usually negative. + """ + return self._file.seek(offset, whence) + + def tell(self) -> int: + """Return the current position.""" + return self._file.tell() + + def truncate(self, size: int | None = None) -> int: + """Resize the stream to the given size in bytes. + + If size is unspecified resize to the current position. + The current stream position isn't changed. + + Return the new file size. + """ + return self._file.truncate(size) + + def writable(self) -> bool: + """Return False.""" + return False + + def write(self, s: Any) -> int: + raise NotImplementedError + + def writelines(self, lines: Iterable[Any]) -> None: + raise NotImplementedError + + def __enter__(self) -> ReadOnlyIOWrapper: + self._file.__enter__() + return self + + def __exit__(self, *exc: Any) -> None: + self._file.__exit__(*exc) + + def __iter__(self) -> Iterator[bytes]: + raise NotImplementedError + + def __next__(self) -> bytes: + raise NotImplementedError + + +class LazyRemoteResource(ReadOnlyIOWrapper): + """Abstract class for a binary file object that lazily downloads its contents. + + This class uses a ``MergeIntervals`` instance to keep track of what it's downloaded. + """ + + def __init__(self, inner: BinaryIO) -> None: + super().__init__(inner) + self._merge_intervals: MergeIntervals | None = None + + @abc.abstractmethod + def fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: + """Call to the remote backend to provide this byte range in one or more chunks. + + NB: For compatibility with HTTP range requests, the range provided to this + method must *include* the byte indexed at argument ``end`` (so e.g. ``0-1`` is 2 + bytes long, and the range can never be empty). + + :raises Exception: implementations may raise an exception for e.g. intermittent + errors accessing the remote resource. + """ + ... + + def _setup_content(self) -> None: + """Ensure ``self._merge_intervals`` is initialized. + + Called in ``__enter__``, and should make recursive invocations into a no-op. + Subclasses may override this method.""" + if self._merge_intervals is None: + self._merge_intervals = MergeIntervals() + + def _reset_content(self) -> None: + """Unset ``self._merge_intervals``. + + Called in ``__exit__``, and should make recursive invocations into a no-op. + Subclasses may override this method.""" + if self._merge_intervals is not None: + logger.debug( + "unsetting merge intervals (were: %s)", repr(self._merge_intervals) + ) + self._merge_intervals = None + + def __enter__(self) -> LazyRemoteResource: + """Call ``self._setup_content()``, then return self.""" + super().__enter__() + self._setup_content() + return self + + def __exit__(self, *exc: Any) -> None: + """Call ``self._reset_content()``.""" + self._reset_content() + super().__exit__(*exc) + + @contextmanager + def _stay(self) -> Iterator[None]: + """Return a context manager keeping the position. + + At the end of the block, seek back to original position. + """ + pos = self.tell() + try: + yield + finally: + self.seek(pos) + + def ensure_downloaded(self, start: int, end: int) -> None: + """Ensures bytes start to end (inclusive) have been downloaded and written to + the backing file. + + :raises ValueError: if ``__enter__`` was not called beforehand. 
+ """ + if self._merge_intervals is None: + raise ValueError(".__enter__() must be called to set up merge intervals") + # Reducing by 1 to get an inclusive end range. + end -= 1 + with self._stay(): + for ( + range_start, + range_end, + ) in self._merge_intervals.minimal_intervals_covering(start, end): + self.seek(start) + for chunk in self.fetch_content_range(range_start, range_end): + self._file.write(chunk) + + +class FixedSizeLazyResource(LazyRemoteResource): + """Fetches a fixed content length from the remote server when initialized in order + to support ``.read(-1)`` and seek calls against ``io.SEEK_END``.""" + + def __init__(self, inner: BinaryIO) -> None: + super().__init__(inner) + self._length: int | None = None + + @abc.abstractmethod + def _fetch_content_length(self) -> int: + """Query the remote resource for the total length of the file. + + This method may also mutate internal state, such as writing to the backing + file. It's marked private because it will only be called within this class's + ``__enter__`` implementation to populate an internal length field. + + :raises Exception: implementations may raise any type of exception if the length + value could not be parsed, or any other issue which might + cause valid calls to ``self.fetch_content_range()`` to fail. + """ + raise NotImplementedError + + def _setup_content(self) -> None: + """Initialize the internal length field and other bookkeeping. + + After parsing the remote file length with ``self._fetch_content_length()``, + this method will truncate the underlying file from parent abstract class + ``ReadOnlyIOWrapper`` to that size in order to support seek operations against + ``io.SEEK_END`` in ``self.read()``. + + This method is called in ``__enter__``. + """ + super()._setup_content() + + if self._length is None: + logger.debug("begin fetching content length") + self._length = self._fetch_content_length() + logger.debug("done fetching content length (is: %d)", self._length) + else: + logger.debug("content length already fetched (is: %d)", self._length) + + # Enable us to seek and write anywhere in the backing file up to this + # known length. + self.truncate(self._length) + + def _reset_content(self) -> None: + """Unset the internal length field and other bookkeeping. + + This method is called in ``__exit__``.""" + super()._reset_content() + + if self._length is not None: + logger.debug("unsetting content length (was: %d)", self._length) + self._length = None + + def read(self, size: int = -1) -> bytes: + """Read up to size bytes from the object and return them. + + As a convenience, if size is unspecified or -1, + all bytes until EOF are returned. Fewer than + size bytes may be returned if EOF is reached. + + :raises ValueError: if ``__enter__`` was not called beforehand. + """ + if self._length is None: + raise ValueError(".__enter__() must be called to set up content length") + cur = self.tell() + logger.debug("read size %d at %d from lazy file %s", size, cur, self.name) + if size < 0: + assert cur <= self._length + download_size = self._length - cur + elif size == 0: + return b"" + else: + download_size = size + stop = min(cur + download_size, self._length) + self.ensure_downloaded(cur, stop) + return super().read(download_size) + + +class LazyHTTPFile(FixedSizeLazyResource): + """File-like object representing a fixed-length file over HTTP. + + This uses HTTP range requests to lazily fetch the file's content into a temporary + file. 
If such requests are not supported by the server, raises + ``HTTPRangeRequestUnsupported`` in the ``__enter__`` method.""" + + def __init__( + self, + url: str, + session: Session | Authenticator, + delete_backing_file: bool = True, + ) -> None: + super().__init__(cast(BinaryIO, NamedTemporaryFile(delete=delete_backing_file))) + + self._request_count = 0 + self._session = session + self._url = url + + def _fetch_content_length(self) -> int: + """Get the remote file's length from a HEAD request.""" + # NB: This is currently dead code, as _fetch_content_length() is overridden + # again in LazyWheelOverHTTP. + return self._content_length_from_head() + + def fetch_content_range(self, start: int, end: int) -> Iterator[bytes]: + """Perform a series of HTTP range requests to cover the specified byte range.""" + yield from self._stream_response(start, end).iter_content(CONTENT_CHUNK_SIZE) + + def __enter__(self) -> LazyHTTPFile: + """Fetch the remote file length and reset the log of downloaded intervals. + + This method must be called before ``.read()``. + + :raises HTTPRangeRequestUnsupported: if the remote server fails to indicate + support for "bytes" ranges. + """ + super().__enter__() + return self + + def __exit__(self, *exc: Any) -> None: + """Logs request count to quickly identify any pathological cases in log data.""" + logger.debug("%d requests for url %s", self._request_count, self._url) + super().__exit__(*exc) + + def _content_length_from_head(self) -> int: + """Performs a HEAD request to extract the Content-Length. + + :raises HTTPRangeRequestUnsupported: if the response fails to indicate support + for "bytes" ranges.""" + self._request_count += 1 + head = self._session.head(self._url, headers=self._uncached_headers()) + head.raise_for_status() + assert head.status_code == codes.ok + accepted_range = head.headers.get("Accept-Ranges", None) + if accepted_range != "bytes": + raise HTTPRangeRequestUnsupported( + f"server does not support byte ranges: header was '{accepted_range}'" + ) + return int(head.headers["Content-Length"]) + + def _stream_response(self, start: int, end: int) -> Response: + """Return streaming HTTP response to a range request from start to end.""" + headers = self._uncached_headers() + headers["Range"] = f"bytes={start}-{end}" + logger.debug("streamed bytes request: %s", headers["Range"]) + self._request_count += 1 + response = self._session.get(self._url, headers=headers, stream=True) + response.raise_for_status() + assert int(response.headers["Content-Length"]) == (end - start + 1) + return response + + @classmethod + def _uncached_headers(cls) -> dict[str, str]: + """HTTP headers to bypass any HTTP caching. + + The requests we perform in this file are intentionally small, and any caching + should be done at a higher level. + + Further, caching partial requests might cause issues: + https://github.com/pypa/pip/pull/8716 + """ + # "no-cache" is the correct value for "up to date every time", so this will also + # ensure we get the most recent value from the server: + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Caching#provide_up-to-date_content_every_time + return {"Accept-Encoding": "identity", "Cache-Control": "no-cache"} + + +class LazyWheelOverHTTP(LazyHTTPFile): + """File-like object mapped to a ZIP file over HTTP. + + This uses HTTP range requests to lazily fetch the file's content, which should be + provided as the first argument to a ``ZipFile``. 
+ """ + + # Cache this on the type to avoid trying and failing our initial lazy wheel request + # multiple times in the same invocation against an index without this support. + _domains_without_negative_range: ClassVar[set[str]] = set() + + # This override is needed for mypy so we can call ``.prefetch_metadata()`` + # within a ``with`` block. + def __enter__(self) -> LazyWheelOverHTTP: + """Fetch the remote file length and reset the log of downloaded intervals. + + This method must be called before ``.read()`` or ``.prefetch_metadata()``. + """ + super().__enter__() + return self + + @classmethod + def _initial_chunk_length(cls) -> int: + """Return the size of the chunk (in bytes) to download from the end of the file. + + This method is called in ``self._fetch_content_length()``. As noted in that + method's docstring, this should be set high enough to cover the central + directory sizes of the *average* wheels you expect to see, in order to avoid + further requests before being able to process the zip file's contents at all. + If we choose a small number, we need one more range request for larger wheels. + If we choose a big number, we download unnecessary data from smaller wheels. + If the chunk size from this method is larger than the size of an entire wheel, + that may raise an HTTP error, but this is gracefully handled in + ``self._fetch_content_length()`` with a small performance penalty. + """ + return 10_000 + + def _fetch_content_length(self) -> int: + """Get the total remote file length, but also download a chunk from the end. + + This method is called within ``__enter__``. In an attempt to reduce + the total number of requests needed to populate this lazy file's contents, this + method will also attempt to fetch a chunk of the file's actual content. This + chunk will be ``self._initial_chunk_length()`` bytes in size, or just the remote + file's length if that's smaller, and the chunk will come from the *end* of + the file. + + This method will first attempt to download with a negative byte range request, + i.e. a GET with the headers ``Range: bytes=-N`` for ``N`` equal to + ``self._initial_chunk_length()``. If negative offsets are unsupported, it will + instead fall back to making a HEAD request first to extract the length, followed + by a GET request with the double-ended range header ``Range: bytes=X-Y`` to + extract the final ``N`` bytes from the remote resource. + """ + initial_chunk_size = self._initial_chunk_length() + ret_length, tail = self._extract_content_length(initial_chunk_size) + + # Need to explicitly truncate here in order to perform the write and seek + # operations below when we write the chunk of file contents to disk. + self.truncate(ret_length) + + if tail is None: + # If we could not download any file contents yet (e.g. if negative byte + # ranges were not supported, or the requested range was larger than the file + # size), then download all of this at once, hopefully pulling in the entire + # central directory. + initial_start = max(0, ret_length - initial_chunk_size) + self.ensure_downloaded(initial_start, ret_length) + else: + # If we *could* download some file contents, then write them to the end of + # the file and set up our bisect boundaries by hand. + with self._stay(): + response_length = int(tail.headers["Content-Length"]) + assert response_length == min(initial_chunk_size, ret_length) + self.seek(-response_length, io.SEEK_END) + # Default initial chunk size is currently 1MB, but streaming content + # here allows it to be set arbitrarily large. 
+ for chunk in tail.iter_content(CONTENT_CHUNK_SIZE): + self._file.write(chunk) + + # We now need to update our bookkeeping to cover the interval we just + # wrote to file so we know not to do it in later read()s. + init_chunk_start = ret_length - response_length + # MergeIntervals uses inclusive boundaries i.e. start <= x <= end. + init_chunk_end = ret_length - 1 + assert self._merge_intervals is not None + assert ((init_chunk_start, init_chunk_end),) == tuple( + # NB: We expect LazyRemoteResource to reset `self._merge_intervals` + # just before it calls the current method, so our assertion here + # checks that indeed no prior overlapping intervals have + # been covered. + self._merge_intervals.minimal_intervals_covering( + init_chunk_start, init_chunk_end + ) + ) + return ret_length + + @staticmethod + def _parse_full_length_from_content_range(arg: str) -> int: + """Parse the file's full underlying length from the Content-Range header. + + This supports both * and numeric ranges, from success or error responses: + https://www.rfc-editor.org/rfc/rfc9110#field.content-range. + """ + m = re.match(r"bytes [^/]+/([0-9]+)", arg) + if m is None: + raise HTTPRangeRequestUnsupported(f"could not parse Content-Range: '{arg}'") + return int(m.group(1)) + + def _try_initial_chunk_request( + self, initial_chunk_size: int + ) -> tuple[int, Response]: + """Attempt to fetch a chunk from the end of the file with a negative offset.""" + headers = self._uncached_headers() + # Perform a negative range index, which is not supported by some servers. + headers["Range"] = f"bytes=-{initial_chunk_size}" + logger.debug("initial bytes request: %s", headers["Range"]) + + self._request_count += 1 + tail = self._session.get(self._url, headers=headers, stream=True) + tail.raise_for_status() + + code = tail.status_code + if code != codes.partial_content: + # According to + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests, + # a 200 OK implies that range requests are not supported, + # regardless of the requested size. + # However, some servers that support negative range requests also return a + # 200 OK if the requested range from the end was larger than the file size. + if code == codes.ok: + accept_ranges = tail.headers.get("Accept-Ranges", None) + content_length = int(tail.headers["Content-Length"]) + if accept_ranges == "bytes" and content_length <= initial_chunk_size: + return content_length, tail + + raise HTTPRangeRequestUnsupported( + f"did not receive partial content: got code {code}" + ) + + file_length = self._parse_full_length_from_content_range( + tail.headers["Content-Range"] + ) + return (file_length, tail) + + def _extract_content_length( + self, initial_chunk_size: int + ) -> tuple[int, Response | None]: + """Get the Content-Length of the remote file, and possibly a chunk of it.""" + domain = urlparse(self._url).netloc + if domain in self._domains_without_negative_range: + return (self._content_length_from_head(), None) + + tail: Response | None + try: + # Initial range request for just the end of the file. + file_length, tail = self._try_initial_chunk_request(initial_chunk_size) + except HTTPError as e: + resp = e.response + code = resp.status_code if resp is not None else None + # Our initial request using a negative byte range was not supported. + if code in [codes.not_implemented, codes.method_not_allowed]: + # pypi notably does not support negative byte ranges: see + # https://github.com/pypi/warehouse/issues/12823. 
+ logger.debug( + "Negative byte range not supported for domain '%s': " + "using HEAD request before lazy wheel from now on", + domain, + ) + # Avoid trying a negative byte range request against this domain for the + # rest of the resolve. + self._domains_without_negative_range.add(domain) + # Apply a HEAD request to get the real size, and nothing else for now. + return (self._content_length_from_head(), None) + # This indicates that the requested range from the end was larger than the + # actual file size: https://www.rfc-editor.org/rfc/rfc9110#status.416. + if code == codes.requested_range_not_satisfiable: + # In this case, we don't have any file content yet, but we do know the + # size the file will be, so we can return that and exit here. + assert resp is not None + file_length = self._parse_full_length_from_content_range( + resp.headers["Content-Range"] + ) + return (file_length, None) + # If we get some other error, then we expect that non-range requests will + # also fail, so we error out here and let the user figure it out. + raise + + # Some servers that do not support negative offsets, + # handle a negative offset like "-10" as "0-10". + if int( + tail.headers["Content-Length"] + ) == initial_chunk_size + 1 and tail.headers["Content-Range"].startswith( + f"bytes 0-{initial_chunk_size}" + ): + tail = None + self._domains_without_negative_range.add(domain) + return file_length, tail + + def prefetch_metadata(self, name: str) -> None: + """Locate the *.dist-info/METADATA entry from a temporary ``ZipFile`` wrapper, + and download it. + + This method assumes that the *.dist-info directory (containing e.g. METADATA) is + contained in a single contiguous section of the zip file in order to ensure it + can be downloaded in a single ranged GET request.""" + logger.debug("begin prefetching METADATA for %s", name) + + dist_info_prefix = re.compile(r"^[^/]*\.dist-info/METADATA") + start: int | None = None + end: int | None = None + + # This may perform further requests if __init__() did not pull in the entire + # central directory at the end of the file (although _initial_chunk_length() + # should be set large enough to avoid this). + zf = ZipFile(self) + + for info in zf.infolist(): + if start is None: + if dist_info_prefix.search(info.filename): + start = info.header_offset + continue + else: + # The last .dist-info/ entry may be before the end of the file if the + # wheel's entries are sorted lexicographically (which is unusual). + if not dist_info_prefix.search(info.filename): + end = info.header_offset + break + if start is None: + raise UnsupportedWheel( + f"no {dist_info_prefix!r} found for {name} in {self.name}" + ) + # If it is the last entry of the zip, then give us everything + # until the start of the central directory. 
+ if end is None: + end = zf.start_dir + logger.debug("fetch METADATA") + self.ensure_downloaded(start, end) + logger.debug("done prefetching METADATA for %s", name) diff --git a/src/poetry/repositories/http_repository.py b/src/poetry/repositories/http_repository.py index 826703a3760..546dfc4dcdc 100644 --- a/src/poetry/repositories/http_repository.py +++ b/src/poetry/repositories/http_repository.py @@ -19,12 +19,16 @@ from poetry.core.utils.helpers import temporary_directory from poetry.core.version.markers import parse_marker +from poetry.config.config import Config +from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported +from poetry.inspection.lazy_wheel import memory_wheel_from_url from poetry.repositories.cached_repository import CachedRepository from poetry.repositories.exceptions import PackageNotFound from poetry.repositories.exceptions import RepositoryError from poetry.repositories.link_sources.html import HTMLPage from poetry.utils.authenticator import Authenticator from poetry.utils.constants import REQUESTS_TIMEOUT +from poetry.utils.helpers import HTTPRangeRequestSupported from poetry.utils.helpers import download_file from poetry.utils.patterns import wheel_file_re @@ -32,7 +36,6 @@ if TYPE_CHECKING: from packaging.utils import NormalizedName - from poetry.config.config import Config from poetry.inspection.info import PackageInfo from poetry.repositories.link_sources.base import LinkSource from poetry.utils.authenticator import RepositoryCertificateConfig @@ -49,6 +52,8 @@ def __init__( ) -> None: super().__init__(name, disable_cache, config) self._url = url + if config is None: + config = Config.create() self._authenticator = Authenticator( config=config, cache_id=name, @@ -58,6 +63,16 @@ def __init__( self._authenticator.add_repository(name, url) self.get_page = functools.lru_cache(maxsize=None)(self._get_page) + self._lazy_wheel = config.get("solver.lazy-wheel", True) + # We are tracking if a domain supports range requests or not to avoid + # unnecessary requests. + # ATTENTION: A domain might support range requests only for some files, so the + # meaning is as follows: + # - Domain not in dict: We don't know anything. + # - True: The domain supports range requests for at least some files. + # - False: The domain does not support range requests for the files we tried. 
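+        # Illustrative (hypothetical) state after a few metadata lookups:
+        #     {"files.pythonhosted.org": True, "legacy.example.com": False}
+        # i.e. lazy wheels keep being tried for the first host, while the second
+        # falls back to full downloads until a response proves otherwise.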
+ self._supports_range_requests: dict[str, bool] = {} + @property def session(self) -> Authenticator: return self._authenticator @@ -74,22 +89,63 @@ def certificates(self) -> RepositoryCertificateConfig: def authenticated_url(self) -> str: return self._authenticator.authenticated_url(url=self.url) - def _download(self, url: str, dest: Path) -> None: - return download_file(url, dest, session=self.session) + def _download( + self, url: str, dest: Path, *, raise_accepts_ranges: bool = False + ) -> None: + return download_file( + url, dest, session=self.session, raise_accepts_ranges=raise_accepts_ranges + ) @contextmanager - def _cached_or_downloaded_file(self, link: Link) -> Iterator[Path]: + def _cached_or_downloaded_file( + self, link: Link, *, raise_accepts_ranges: bool = False + ) -> Iterator[Path]: self._log(f"Downloading: {link.url}", level="debug") with temporary_directory() as temp_dir: filepath = Path(temp_dir) / link.filename - self._download(link.url, filepath) + self._download( + link.url, filepath, raise_accepts_ranges=raise_accepts_ranges + ) yield filepath def _get_info_from_wheel(self, url: str) -> PackageInfo: from poetry.inspection.info import PackageInfo - with self._cached_or_downloaded_file(Link(url)) as filepath: - return PackageInfo.from_wheel(filepath) + link = Link(url) + netloc = link.netloc + + # If "lazy-wheel" is enabled and the domain supports range requests + # or we don't know yet, we try range requests. + if self._lazy_wheel and self._supports_range_requests.get(netloc, True): + try: + package_info = PackageInfo.from_memory_wheel( + memory_wheel_from_url(link.filename, link.url, self.session) + ) + except HTTPRangeRequestUnsupported: + # Do not set to False if we already know that the domain supports + # range requests for some URLs! + if netloc not in self._supports_range_requests: + self._supports_range_requests[netloc] = False + else: + self._supports_range_requests[netloc] = True + return package_info + + try: + with self._cached_or_downloaded_file( + link, raise_accepts_ranges=self._lazy_wheel + ) as filepath: + return PackageInfo.from_wheel(filepath) + except HTTPRangeRequestSupported: + # The domain did not support range requests for the first URL(s) we tried, + # but supports it for some URLs (especially the current URL), + # so we abort the download, update _supports_range_requests to try + # range requests for all files and use it for the current URL. 
+ self._log( + f"Abort downloading {link.url} because server supports range requests", + level="debug", + ) + self._supports_range_requests[netloc] = True + return self._get_info_from_wheel(link.url) def _get_info_from_sdist(self, url: str) -> PackageInfo: from poetry.inspection.info import PackageInfo diff --git a/src/poetry/utils/authenticator.py b/src/poetry/utils/authenticator.py index 2628e5ba699..c5e913db630 100644 --- a/src/poetry/utils/authenticator.py +++ b/src/poetry/utils/authenticator.py @@ -270,6 +270,10 @@ def _get_backoff(self, response: requests.Response | None, attempt: int) -> floa def get(self, url: str, **kwargs: Any) -> requests.Response: return self.request("get", url, **kwargs) + def head(self, url: str, **kwargs: Any) -> requests.Response: + kwargs.setdefault("allow_redirects", False) + return self.request("head", url, **kwargs) + def post(self, url: str, **kwargs: Any) -> requests.Response: return self.request("post", url, **kwargs) diff --git a/src/poetry/utils/helpers.py b/src/poetry/utils/helpers.py index e2ee03b467b..41994fee3e9 100644 --- a/src/poetry/utils/helpers.py +++ b/src/poetry/utils/helpers.py @@ -98,16 +98,25 @@ def merge_dicts(d1: dict[str, Any], d2: dict[str, Any]) -> None: d1[k] = d2[k] +class HTTPRangeRequestSupported(Exception): + """Raised when server unexpectedly supports byte ranges.""" + + def download_file( url: str, dest: Path, + *, session: Authenticator | Session | None = None, chunk_size: int = 1024, + raise_accepts_ranges: bool = False, ) -> None: from poetry.puzzle.provider import Indicator downloader = Downloader(url, dest, session) + if raise_accepts_ranges and downloader.accepts_ranges: + raise HTTPRangeRequestSupported(f"URL {url} supports range requests.") + set_indicator = False with Indicator.context() as update_context: update_context(f"Downloading {url}") @@ -146,6 +155,10 @@ def __init__( ) self._response.raise_for_status() + @cached_property + def accepts_ranges(self) -> bool: + return self._response.headers.get("Accept-Ranges") == "bytes" + @cached_property def total_size(self) -> int: total_size = 0 diff --git a/tests/console/commands/test_config.py b/tests/console/commands/test_config.py index 24b89a63d9b..6c15e6a35fb 100644 --- a/tests/console/commands/test_config.py +++ b/tests/console/commands/test_config.py @@ -13,6 +13,7 @@ from poetry.config.config_source import ConfigSource from poetry.console.commands.install import InstallCommand from poetry.factory import Factory +from poetry.repositories.legacy_repository import LegacyRepository from tests.conftest import Config @@ -58,6 +59,7 @@ def test_list_displays_default_value_if_not_set( installer.modern-installation = true installer.no-binary = null installer.parallel = true +solver.lazy-wheel = true virtualenvs.create = true virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -88,6 +90,7 @@ def test_list_displays_set_get_setting( installer.modern-installation = true installer.no-binary = null installer.parallel = true +solver.lazy-wheel = true virtualenvs.create = false virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -139,6 +142,7 @@ def test_unset_setting( installer.modern-installation = true installer.no-binary = null installer.parallel = true +solver.lazy-wheel = true virtualenvs.create = true virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -168,6 +172,7 @@ def test_unset_repo_setting( installer.modern-installation = true installer.no-binary = null installer.parallel = true +solver.lazy-wheel = 
true virtualenvs.create = true virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -296,6 +301,7 @@ def test_list_displays_set_get_local_setting( installer.modern-installation = true installer.no-binary = null installer.parallel = true +solver.lazy-wheel = true virtualenvs.create = false virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -334,6 +340,7 @@ def test_list_must_not_display_sources_from_pyproject_toml( installer.no-binary = null installer.parallel = true repositories.foo.url = "https://foo.bar/simple/" +solver.lazy-wheel = true virtualenvs.create = true virtualenvs.in-project = null virtualenvs.options.always-copy = false @@ -531,3 +538,21 @@ def test_config_installer_no_binary( config = Config.create(reload=True) assert not DeepDiff(config.get(setting), expected, ignore_order=True) + + +def test_config_solver_lazy_wheel( + tester: CommandTester, command_tester_factory: CommandTesterFactory +) -> None: + tester.execute("--local solver.lazy-wheel") + assert tester.io.fetch_output().strip() == "true" + + repo = LegacyRepository("foo", "https://foo.com") + assert repo._lazy_wheel + + tester.io.clear_output() + tester.execute("--local solver.lazy-wheel false") + tester.execute("--local solver.lazy-wheel") + assert tester.io.fetch_output().strip() == "false" + + repo = LegacyRepository("foo", "https://foo.com") + assert not repo._lazy_wheel diff --git a/tests/fixtures/distributions/demo_missing_dist_info-0.1.0-py2.py3-none-any.whl b/tests/fixtures/distributions/demo_missing_dist_info-0.1.0-py2.py3-none-any.whl new file mode 100644 index 0000000000000000000000000000000000000000..d59d962e77f1c8c87078f970ccc8a64d99e2f4be GIT binary patch literal 310 zcmWIWW@Zs#0Dn`T;;CTu?zph9<7UmtsI32n)cJ#K&jmWtPOp z>lIYS$CsrR6=&w>#m6hyDySRi8R{9Ra|L)aGTAfWvPcDLI> None: +def mock_download( + url: str, + dest: Path, + *, + session: Authenticator | Session | None = None, + chunk_size: int = 1024, + raise_accepts_ranges: bool = False, +) -> None: parts = urllib.parse.urlparse(url) fixture = FIXTURE_PATH / parts.path.lstrip("/") diff --git a/tests/inspection/conftest.py b/tests/inspection/conftest.py new file mode 100644 index 00000000000..a876ada578e --- /dev/null +++ b/tests/inspection/conftest.py @@ -0,0 +1,146 @@ +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any +from typing import Protocol +from urllib.parse import urlparse + +import pytest + +from requests import codes + + +if TYPE_CHECKING: + from collections.abc import Callable + + from httpretty.core import HTTPrettyRequest + + from tests.types import FixtureDirGetter + + HttPrettyResponse = tuple[int, dict[str, Any], bytes] # status code, headers, body + HttPrettyRequestCallback = Callable[ + [HTTPrettyRequest, str, dict[str, Any]], HttPrettyResponse + ] + + class RequestCallbackFactory(Protocol): + def __call__( + self, + *, + accept_ranges: str | None = "bytes", + negative_offset_error: tuple[int, bytes] | None = None, + ) -> HttPrettyRequestCallback: ... 
+ + +NEGATIVE_OFFSET_AS_POSITIVE = -1 + + +def build_head_response( + accept_ranges: str | None, content_length: int, response_headers: dict[str, Any] +) -> HttPrettyResponse: + response_headers["Content-Length"] = content_length + if accept_ranges: + response_headers["Accept-Ranges"] = accept_ranges + return 200, response_headers, b"" + + +def build_partial_response( + rng: str, + wheel_bytes: bytes, + response_headers: dict[str, Any], + *, + negative_offset_as_positive: bool = False, +) -> HttPrettyResponse: + status_code = 206 + response_headers["Accept-Ranges"] = "bytes" + total_length = len(wheel_bytes) + if rng.startswith("-"): + # negative offset + offset = int(rng) + if negative_offset_as_positive: + # some servers interpret a negative offset like "-10" as "0-10" + start = 0 + end = min(-offset, total_length - 1) + body = wheel_bytes[start : end + 1] + else: + start = total_length + offset + if start < 0: + # wheel is smaller than initial chunk size + return 200, response_headers, wheel_bytes + end = total_length - 1 + body = wheel_bytes[offset:] + else: + # range with start and end + rng_parts = rng.split("-") + start = int(rng_parts[0]) + end = int(rng_parts[1]) + body = wheel_bytes[start : end + 1] + response_headers["Content-Range"] = f"bytes {start}-{end}/{total_length}" + return status_code, response_headers, body + + +@pytest.fixture +def handle_request_factory(fixture_dir: FixtureDirGetter) -> RequestCallbackFactory: + def _factory( + *, + accept_ranges: str | None = "bytes", + negative_offset_error: tuple[int, bytes] | None = None, + ) -> HttPrettyRequestCallback: + def handle_request( + request: HTTPrettyRequest, uri: str, response_headers: dict[str, Any] + ) -> HttPrettyResponse: + name = Path(urlparse(uri).path).name + + wheel = Path(__file__).parent.parent.joinpath( + "repositories/fixtures/pypi.org/dists/" + name + ) + + if not wheel.exists(): + wheel = fixture_dir("distributions") / name + + if not wheel.exists(): + wheel = ( + fixture_dir("distributions") / "demo-0.1.0-py2.py3-none-any.whl" + ) + + wheel_bytes = wheel.read_bytes() + + del response_headers["status"] + + if request.method == "HEAD": + return build_head_response( + accept_ranges, len(wheel_bytes), response_headers + ) + + rng = request.headers.get("Range", "") + if rng: + rng = rng.split("=")[1] + + negative_offset_as_positive = False + if negative_offset_error and rng.startswith("-"): + if negative_offset_error[0] == codes.requested_range_not_satisfiable: + response_headers["Content-Range"] = f"bytes */{len(wheel_bytes)}" + if negative_offset_error[0] == NEGATIVE_OFFSET_AS_POSITIVE: + negative_offset_as_positive = True + else: + return ( + negative_offset_error[0], + response_headers, + negative_offset_error[1], + ) + if accept_ranges == "bytes" and rng: + return build_partial_response( + rng, + wheel_bytes, + response_headers, + negative_offset_as_positive=negative_offset_as_positive, + ) + + status_code = 200 + body = wheel_bytes + + return status_code, response_headers, body + + return handle_request + + return _factory diff --git a/tests/inspection/test_info.py b/tests/inspection/test_info.py index 5c1a757bcaf..6ca3fdb103c 100644 --- a/tests/inspection/test_info.py +++ b/tests/inspection/test_info.py @@ -4,9 +4,12 @@ from typing import TYPE_CHECKING import pytest +import requests from poetry.inspection.info import PackageInfo from poetry.inspection.info import PackageInfoError +from poetry.inspection.lazy_wheel import MemoryWheel +from poetry.inspection.lazy_wheel import memory_wheel_from_url from 
poetry.utils.env import EnvCommandError from poetry.utils.env import VirtualEnv @@ -14,8 +17,10 @@ if TYPE_CHECKING: from pathlib import Path + from httpretty import httpretty from pytest_mock import MockerFixture + from tests.inspection.conftest import RequestCallbackFactory from tests.types import FixtureDirGetter @@ -34,6 +39,18 @@ def demo_wheel(fixture_dir: FixtureDirGetter) -> Path: return fixture_dir("distributions") / "demo-0.1.0-py2.py3-none-any.whl" +@pytest.fixture +def demo_memory_wheel( + http: type[httpretty], + handle_request_factory: RequestCallbackFactory, +) -> MemoryWheel: + url = "https://foo.com/demo-0.1.0-py2.py3-none-any.whl" + request_callback = handle_request_factory() + http.register_uri(http.GET, url, body=request_callback) + + return memory_wheel_from_url("demo", url, requests.Session()) + + @pytest.fixture def source_dir(tmp_path: Path) -> Path: path = tmp_path / "source" @@ -133,6 +150,13 @@ def test_info_from_wheel(demo_wheel: Path) -> None: assert info._source_url == demo_wheel.resolve().as_posix() +def test_info_from_memory_wheel(demo_memory_wheel: MemoryWheel) -> None: + info = PackageInfo.from_memory_wheel(demo_memory_wheel) + demo_check_info(info) + assert info._source_type is None + assert info._source_url is None + + def test_info_from_bdist(demo_wheel: Path) -> None: info = PackageInfo.from_bdist(demo_wheel) demo_check_info(info) diff --git a/tests/inspection/test_lazy_wheel.py b/tests/inspection/test_lazy_wheel.py new file mode 100644 index 00000000000..6c21b86a71a --- /dev/null +++ b/tests/inspection/test_lazy_wheel.py @@ -0,0 +1,188 @@ +from __future__ import annotations + +import re + +from typing import TYPE_CHECKING + +import httpretty +import pytest +import requests + +from requests import codes + +from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported +from poetry.inspection.lazy_wheel import InvalidWheel +from poetry.inspection.lazy_wheel import memory_wheel_from_url +from tests.inspection.conftest import NEGATIVE_OFFSET_AS_POSITIVE + + +if TYPE_CHECKING: + from tests.inspection.conftest import RequestCallbackFactory + + +@pytest.mark.parametrize( + "negative_offset_error", + [ + None, + (codes.method_not_allowed, b"Method not allowed"), + (codes.requested_range_not_satisfiable, b"Requested range not satisfiable"), + (codes.not_implemented, b"Unsupported client range"), + (NEGATIVE_OFFSET_AS_POSITIVE, b"handle negative offset as positive"), + ], +) +def test_memory_wheel_from_url( + http: type[httpretty.httpretty], + handle_request_factory: RequestCallbackFactory, + negative_offset_error: tuple[int, bytes] | None, +) -> None: + domain = ( + f"lazy-wheel-{negative_offset_error[0] if negative_offset_error else 0}.com" + ) + uri_regex = re.compile(f"^https://{domain}/.*$") + request_callback = handle_request_factory( + negative_offset_error=negative_offset_error + ) + http.register_uri(http.GET, uri_regex, body=request_callback) + http.register_uri(http.HEAD, uri_regex, body=request_callback) + + url = f"https://{domain}/poetry_core-1.5.0-py3-none-any.whl" + + wheel = memory_wheel_from_url("poetry-core", url, requests.Session()) + + assert wheel.name == "poetry-core" + assert wheel.version == "1.5.0" + assert wheel.author == "Sébastien Eustace" + assert wheel.requires_dist == [ + 'importlib-metadata (>=1.7.0) ; python_version < "3.8"' + ] + + # negative offsets supported: + # 1. end of central directory + # 2. whole central directory + # 3. METADATA file + # negative offsets not supported: + # 1. failed range request + # 2. 
HEAD request + # 3.-5. see negative offsets 1.-3. + expected_requests = 3 + if negative_offset_error: + if negative_offset_error[0] in ( + codes.requested_range_not_satisfiable, + NEGATIVE_OFFSET_AS_POSITIVE, + ): + expected_requests += 1 + else: + expected_requests += 2 + latest_requests = http.latest_requests() + assert len(latest_requests) == expected_requests + + # second wheel -> one less request if negative offsets are not supported + latest_requests.clear() + memory_wheel_from_url("poetry-core", url, requests.Session()) + expected_requests = min(expected_requests, 4) + latest_requests = httpretty.latest_requests() + assert len(latest_requests) == expected_requests + + +@pytest.mark.parametrize("negative_offset_as_positive", [False, True]) +def test_memory_wheel_from_url_smaller_than_initial_chunk_size( + http: type[httpretty.httpretty], + handle_request_factory: RequestCallbackFactory, + negative_offset_as_positive: bool, +) -> None: + domain = f"tiny-wheel-{str(negative_offset_as_positive).lower()}.com" + uri_regex = re.compile(f"^https://{domain}/.*$") + request_callback = handle_request_factory( + negative_offset_error=( + (NEGATIVE_OFFSET_AS_POSITIVE, b"") if negative_offset_as_positive else None + ) + ) + http.register_uri(http.GET, uri_regex, body=request_callback) + http.register_uri(http.HEAD, uri_regex, body=request_callback) + + url = f"https://{domain}/zipp-3.5.0-py3-none-any.whl" + + wheel = memory_wheel_from_url("zipp", url, requests.Session()) + + assert wheel.name == "zipp" + assert wheel.version == "3.5.0" + assert wheel.author == "Jason R. Coombs" + assert len(wheel.requires_dist) == 12 + + # only one request because server gives a normal response with the entire wheel + # except for when server interprets negative offset as positive + latest_requests = http.latest_requests() + assert len(latest_requests) == 1 + + +@pytest.mark.parametrize("accept_ranges", [None, "none"]) +def test_memory_wheel_from_url_range_requests_not_supported_one_request( + http: type[httpretty.httpretty], + handle_request_factory: RequestCallbackFactory, + accept_ranges: str | None, +) -> None: + domain = "no-range-requests.com" + uri_regex = re.compile(f"^https://{domain}/.*$") + request_callback = handle_request_factory(accept_ranges=accept_ranges) + http.register_uri(http.GET, uri_regex, body=request_callback) + http.register_uri(http.HEAD, uri_regex, body=request_callback) + + url = f"https://{domain}/poetry_core-1.5.0-py3-none-any.whl" + + with pytest.raises(HTTPRangeRequestUnsupported): + memory_wheel_from_url("poetry-core", url, requests.Session()) + + latest_requests = http.latest_requests() + assert len(latest_requests) == 1 + assert latest_requests[0].method == "GET" + + +@pytest.mark.parametrize( + "negative_offset_error", + [ + (codes.method_not_allowed, b"Method not allowed"), + (codes.not_implemented, b"Unsupported client range"), + ], +) +def test_memory_wheel_from_url_range_requests_not_supported_two_requests( + http: type[httpretty.httpretty], + handle_request_factory: RequestCallbackFactory, + negative_offset_error: tuple[int, bytes], +) -> None: + domain = f"no-negative-offsets-{negative_offset_error[0]}.com" + uri_regex = re.compile(f"^https://{domain}/.*$") + request_callback = handle_request_factory( + accept_ranges=None, negative_offset_error=negative_offset_error + ) + http.register_uri(http.GET, uri_regex, body=request_callback) + http.register_uri(http.HEAD, uri_regex, body=request_callback) + + url = f"https://{domain}/poetry_core-1.5.0-py3-none-any.whl" + + with 
pytest.raises(HTTPRangeRequestUnsupported): + memory_wheel_from_url("poetry-core", url, requests.Session()) + + latest_requests = http.latest_requests() + assert len(latest_requests) == 2 + assert latest_requests[0].method == "GET" + assert latest_requests[1].method == "HEAD" + + +def test_memory_wheel_from_url_invalid_wheel( + http: type[httpretty.httpretty], + handle_request_factory: RequestCallbackFactory, +) -> None: + domain = "invalid-wheel.com" + uri_regex = re.compile(f"^https://{domain}/.*$") + request_callback = handle_request_factory() + http.register_uri(http.GET, uri_regex, body=request_callback) + http.register_uri(http.HEAD, uri_regex, body=request_callback) + + url = f"https://{domain}/demo_missing_dist_info-0.1.0-py2.py3-none-any.whl" + + with pytest.raises(InvalidWheel): + memory_wheel_from_url("demo-missing-dist-info", url, requests.Session()) + + latest_requests = http.latest_requests() + assert len(latest_requests) == 1 + assert latest_requests[0].method == "GET" diff --git a/tests/repositories/test_http_repository.py b/tests/repositories/test_http_repository.py new file mode 100644 index 00000000000..0eb0e9046e0 --- /dev/null +++ b/tests/repositories/test_http_repository.py @@ -0,0 +1,144 @@ +from __future__ import annotations + +import shutil + +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any + +import pytest + +from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported +from poetry.inspection.lazy_wheel import MemoryWheel +from poetry.repositories.http_repository import HTTPRepository +from poetry.utils.helpers import HTTPRangeRequestSupported + + +if TYPE_CHECKING: + from packaging.utils import NormalizedName + from poetry.core.constraints.version import Version + from pytest_mock import MockerFixture + + +class MockRepository(HTTPRepository): + DIST_FIXTURES = Path(__file__).parent / "fixtures" / "pypi.org" / "dists" + + def __init__(self, lazy_wheel: bool = True) -> None: + super().__init__("foo", "https://foo.com") + self._lazy_wheel = lazy_wheel + + def _get_release_info( + self, name: NormalizedName, version: Version + ) -> dict[str, Any]: + raise NotImplementedError + + +@pytest.mark.parametrize("lazy_wheel", [False, True]) +@pytest.mark.parametrize("supports_range_requests", [None, False, True]) +def test_get_info_from_wheel( + mocker: MockerFixture, lazy_wheel: bool, supports_range_requests: bool | None +) -> None: + filename = "poetry_core-1.5.0-py3-none-any.whl" + filepath = MockRepository.DIST_FIXTURES / filename + + mock_memory_wheel_from_url = mocker.patch( + "poetry.repositories.http_repository.memory_wheel_from_url", + return_value=MemoryWheel(filepath), # type: ignore[arg-type] + ) + mock_download = mocker.patch( + "poetry.repositories.http_repository.download_file", + side_effect=lambda _, dest, *args, **kwargs: shutil.copy(filepath, dest), + ) + + domain = "foo.com" + url = f"https://{domain}/{filename}" + repo = MockRepository(lazy_wheel) + assert not repo._supports_range_requests + if lazy_wheel and supports_range_requests is not None: + repo._supports_range_requests[domain] = supports_range_requests + + info = repo._get_info_from_wheel(url) + assert info.name == "poetry-core" + assert info.version == "1.5.0" + assert info.requires_dist == [ + 'importlib-metadata (>=1.7.0) ; python_version < "3.8"' + ] + + if lazy_wheel and (supports_range_requests or supports_range_requests is None): + mock_memory_wheel_from_url.assert_called_once_with(filename, url, repo.session) + mock_download.assert_not_called() 
+ assert repo._supports_range_requests[domain] is True + else: + mock_memory_wheel_from_url.assert_not_called() + mock_download.assert_called_once_with( + url, mocker.ANY, session=repo.session, raise_accepts_ranges=lazy_wheel + ) + if lazy_wheel: + assert repo._supports_range_requests[domain] is False + else: + assert domain not in repo._supports_range_requests + + +def test_get_info_from_wheel_state_sequence(mocker: MockerFixture) -> None: + """ + 1. We know nothing: + Try range requests, which are not supported and fall back to complete download. + 2. Range requests were not supported so far: + We do not try range requests again. + 3. Range requests were still not supported so far: + We do not try range requests again, but we notice that the response header + contains "Accept-Ranges: bytes", so range requests are at least supported + for some files, which means we want to try again. + 4. Range requests are supported for some files: + We try range requests (success). + 5. Range requests are supported for some files: + We try range requests (failure), but do not forget that range requests are + supported for some files. + 6. Range requests are supported for some files: + We try range requests (success). + """ + mock_memory_wheel_from_url = mocker.patch( + "poetry.repositories.http_repository.memory_wheel_from_url" + ) + mock_download = mocker.patch("poetry.repositories.http_repository.download_file") + + filename = "poetry_core-1.5.0-py3-none-any.whl" + domain = "foo.com" + url = f"https://{domain}/{filename}" + repo = MockRepository() + + # 1. range request and download + mock_memory_wheel_from_url.side_effect = HTTPRangeRequestUnsupported + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 1 + assert mock_download.call_count == 1 + + # 2. only download + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 1 + assert mock_download.call_count == 2 + + # 3. range request and download + mock_memory_wheel_from_url.side_effect = None + mock_download.side_effect = HTTPRangeRequestSupported + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 2 + assert mock_download.call_count == 3 + + # 4. only range request + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 3 + assert mock_download.call_count == 3 + + # 5. range request and download + mock_memory_wheel_from_url.side_effect = HTTPRangeRequestUnsupported + mock_download.side_effect = None + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 4 + assert mock_download.call_count == 4 + + # 6. 
only range request + mock_memory_wheel_from_url.side_effect = None + repo._get_info_from_wheel(url) + assert mock_memory_wheel_from_url.call_count == 5 + assert mock_download.call_count == 4 diff --git a/tests/repositories/test_legacy_repository.py b/tests/repositories/test_legacy_repository.py index 3577836d420..71b831641fa 100644 --- a/tests/repositories/test_legacy_repository.py +++ b/tests/repositories/test_legacy_repository.py @@ -41,6 +41,7 @@ class MockRepository(LegacyRepository): def __init__(self) -> None: super().__init__("legacy", url="http://legacy.foo.bar", disable_cache=True) + self._lazy_wheel = False def _get_page(self, name: NormalizedName) -> SimpleRepositoryPage: fixture = self.FIXTURES / (name + ".html") @@ -50,7 +51,9 @@ def _get_page(self, name: NormalizedName) -> SimpleRepositoryPage: with fixture.open(encoding="utf-8") as f: return SimpleRepositoryPage(self._url + f"/{name}/", f.read()) - def _download(self, url: str, dest: Path) -> None: + def _download( + self, url: str, dest: Path, *, raise_accepts_ranges: bool = False + ) -> None: filename = Link(url).filename filepath = self.FIXTURES.parent / "pypi.org" / "dists" / filename diff --git a/tests/repositories/test_pypi_repository.py b/tests/repositories/test_pypi_repository.py index 9169430729e..a5a437efac8 100644 --- a/tests/repositories/test_pypi_repository.py +++ b/tests/repositories/test_pypi_repository.py @@ -38,6 +38,7 @@ class MockRepository(PyPiRepository): def __init__(self, fallback: bool = False) -> None: super().__init__(url="http://foo.bar", disable_cache=True, fallback=fallback) + self._lazy_wheel = False def get_json_page(self, name: NormalizedName) -> SimpleJsonPage: fixture = self.JSON_FIXTURES / (name + ".json") @@ -66,7 +67,9 @@ def _get( data: dict[str, Any] = json.load(f) return data - def _download(self, url: str, dest: Path) -> None: + def _download( + self, url: str, dest: Path, *, raise_accepts_ranges: bool = False + ) -> None: filename = url.split("/")[-1] fixture = self.DIST_FIXTURES / filename diff --git a/tests/repositories/test_single_page_repository.py b/tests/repositories/test_single_page_repository.py index cce758bb312..bbe3002c8e7 100644 --- a/tests/repositories/test_single_page_repository.py +++ b/tests/repositories/test_single_page_repository.py @@ -25,6 +25,7 @@ def __init__(self, page: str) -> None: url=f"http://single-page.foo.bar/{page}.html", disable_cache=True, ) + self._lazy_wheel = False def _get_page(self, name: NormalizedName) -> SimpleRepositoryPage: fixture = self.FIXTURES / self.url.rsplit("/", 1)[-1] @@ -34,7 +35,9 @@ def _get_page(self, name: NormalizedName) -> SimpleRepositoryPage: with fixture.open(encoding="utf-8") as f: return SimpleRepositoryPage(self._url, f.read()) - def _download(self, url: str, dest: Path) -> None: + def _download( + self, url: str, dest: Path, *, raise_accepts_ranges: bool = False + ) -> None: raise RuntimeError("Tests are not configured for downloads") diff --git a/tests/utils/test_helpers.py b/tests/utils/test_helpers.py index 84e1520a39b..2a6d2baf191 100644 --- a/tests/utils/test_helpers.py +++ b/tests/utils/test_helpers.py @@ -1,11 +1,13 @@ from __future__ import annotations from typing import TYPE_CHECKING +from typing import Any import pytest from poetry.core.utils.helpers import parse_requires +from poetry.utils.helpers import HTTPRangeRequestSupported from poetry.utils.helpers import download_file from poetry.utils.helpers import get_file_hash @@ -14,6 +16,7 @@ from pathlib import Path from httpretty import httpretty + from 
httpretty.core import HTTPrettyRequest from tests.types import FixtureDirGetter @@ -139,3 +142,35 @@ def test_download_file( expect_sha_256 = "9fa123ad707a5c6c944743bf3e11a0e80d86cb518d3cf25320866ca3ef43e2ad" assert get_file_hash(dest) == expect_sha_256 assert http.last_request().headers["Accept-Encoding"] == "Identity" + + +@pytest.mark.parametrize("accepts_ranges", [False, True]) +@pytest.mark.parametrize("raise_accepts_ranges", [False, True]) +def test_download_file_raise_accepts_ranges( + http: type[httpretty], + fixture_dir: FixtureDirGetter, + tmp_path: Path, + accepts_ranges: bool, + raise_accepts_ranges: bool, +) -> None: + filename = "demo-0.1.0-py2.py3-none-any.whl" + + def handle_request( + request: HTTPrettyRequest, uri: str, response_headers: dict[str, Any] + ) -> tuple[int, dict[str, Any], bytes]: + file_path = fixture_dir("distributions") / filename + if accepts_ranges: + response_headers["Accept-Ranges"] = "bytes" + return 200, response_headers, file_path.read_bytes() + + url = f"https://foo.com/{filename}" + http.register_uri(http.GET, url, body=handle_request) + dest = tmp_path / filename + + if accepts_ranges and raise_accepts_ranges: + with pytest.raises(HTTPRangeRequestSupported): + download_file(url, dest, raise_accepts_ranges=raise_accepts_ranges) + assert not dest.exists() + else: + download_file(url, dest, raise_accepts_ranges=raise_accepts_ranges) + assert dest.is_file()
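The new `test_download_file_raise_accepts_ranges` above pins down the opt-in signal: with `raise_accepts_ranges=True`, `download_file` must raise `HTTPRangeRequestSupported` before writing anything whenever the server advertises `Accept-Ranges: bytes`. A minimal sketch of that behaviour, assuming a requests-based helper (the function name, keyword arguments, and exception come from the patch; the body, timeout, and chunk size are illustrative, not the actual implementation in `src/poetry/utils/helpers.py`):

```python
# Sketch only: mirrors the behaviour exercised by the tests above,
# not the actual implementation of poetry.utils.helpers.download_file.
from __future__ import annotations

from pathlib import Path

import requests

from poetry.utils.helpers import HTTPRangeRequestSupported


def download_file(
    url: str,
    dest: Path,
    *,
    session: requests.Session | None = None,
    raise_accepts_ranges: bool = False,
    chunk_size: int = 4096,
) -> None:
    get = session.get if session is not None else requests.get
    # The existing test checks that identity encoding is requested explicitly.
    headers = {"Accept-Encoding": "Identity"}
    with get(url, stream=True, headers=headers, timeout=30) as response:
        response.raise_for_status()
        if raise_accepts_ranges and response.headers.get("Accept-Ranges") == "bytes":
            # Bail out before writing anything so the caller can switch to
            # range requests; the test asserts that dest was never created.
            raise HTTPRangeRequestSupported(f"{url} supports range requests")
        with dest.open("wb") as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)
```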
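The `_get_info_from_wheel` tests in `tests/repositories/test_http_repository.py`, together with the state sequence spelled out in `test_get_info_from_wheel_state_sequence`, describe per-domain bookkeeping: try range requests first, remember domains that refuse them, fall back to a full download, and switch back to range requests once `HTTPRangeRequestSupported` shows that a domain accepts them after all. Below is a rough sketch of control flow consistent with that sequence; the class, its attributes, and the recursion are assumptions for illustration only, not the code in `src/poetry/repositories/http_repository.py`.

```python
from __future__ import annotations

from pathlib import Path
from tempfile import TemporaryDirectory
from urllib.parse import urlparse

import requests

from poetry.inspection.info import PackageInfo
from poetry.inspection.lazy_wheel import HTTPRangeRequestUnsupported
from poetry.inspection.lazy_wheel import memory_wheel_from_url
from poetry.utils.helpers import HTTPRangeRequestSupported
from poetry.utils.helpers import download_file


class _LazyWheelFallbackSketch:
    """Hypothetical stand-in for the lazy-wheel parts of HTTPRepository."""

    def __init__(self, session: requests.Session, lazy_wheel: bool = True) -> None:
        self.session = session
        self._lazy_wheel = lazy_wheel
        self._supports_range_requests: dict[str, bool] = {}

    def _get_info_from_wheel(self, url: str) -> PackageInfo:
        filename = url.rsplit("/", 1)[-1]
        domain = urlparse(url).netloc
        # Try range requests unless this domain is already known not to support them.
        if self._lazy_wheel and self._supports_range_requests.get(domain, True):
            try:
                wheel = memory_wheel_from_url(filename, url, self.session)
            except HTTPRangeRequestUnsupported:
                # A single failure must not demote a domain that is already
                # known to support range requests for other files (steps 5/6).
                self._supports_range_requests.setdefault(domain, False)
            else:
                self._supports_range_requests[domain] = True
                return PackageInfo.from_memory_wheel(wheel)
        # Fall back to a full download.  With lazy-wheel enabled, ask the helper
        # to signal if the server advertises "Accept-Ranges: bytes" after all.
        with TemporaryDirectory() as tmp_dir:
            dest = Path(tmp_dir) / filename
            try:
                download_file(
                    url,
                    dest,
                    session=self.session,
                    raise_accepts_ranges=self._lazy_wheel,
                )
            except HTTPRangeRequestSupported:
                # Remember the capability and retry lazily (step 3 above).
                self._supports_range_requests[domain] = True
                return self._get_info_from_wheel(url)
            return PackageInfo.from_wheel(dest)
```

With this shape, step 5 of the sequence keeps the domain marked as range-capable because `setdefault` never overwrites an existing `True`, which is exactly the "do not forget" behaviour the docstring demands.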