Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(experimental) Add partial-wheel-download functionality, to reduce time spent downloading wheels that are eventually discarded and allow parallel downloads #8448

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions news/8448.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add a set of utilities in ``pip._internal.network.shallow`` for fetching metadata from remote wheel files without downloading the entire file. Link these utilities into the v2 resolver by adding a new ShallowWheelDistribution AbstractDistribution subclass. Expose this behavior via a --unstable-feature=shallow_wheels command-line option to ``pip download``. This produces a marked performance improvement.
2 changes: 1 addition & 1 deletion src/pip/_internal/cli/cmdoptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,7 +919,7 @@ def check_list_path_option(options):
metavar='feature',
action='append',
default=[],
choices=['resolver'],
choices=['resolver', 'shallow_wheels'],
help=SUPPRESS_HELP, # TODO: Enable this when the resolver actually works.
# help='Enable unstable feature(s) that may be backward incompatible.',
) # type: Callable[..., Option]
Expand Down
19 changes: 16 additions & 3 deletions src/pip/_internal/cli/progress_bars.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pip._internal.utils.typing import MYPY_CHECK_RUNNING

if MYPY_CHECK_RUNNING:
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional

try:
from pip._vendor import colorama
Expand All @@ -24,6 +24,18 @@
colorama = None


def _signal_unless_backgrounded(signum, handler):
    # type: (int, Any) -> Optional[Any]
    """Install ``handler`` for ``signum``, tolerating a ``ValueError``.

    Returns whatever ``signal()`` returns, or ``None`` when ``signal()``
    raised ``ValueError`` and nothing was installed.
    """
    previous = None
    try:
        previous = signal(signum, handler)
    except ValueError:
        # FIXME: signal() otherwise doesn't work when called from a non-main
        # thread. That is what happens when more than one wheel is downloaded
        # at once via threading: each worker calls back to Downloader, which
        # uses this progress bar. The failure is swallowed on purpose.
        pass
    return previous


def _select_progress_class(preferred, fallback):
# type: (Bar, Bar) -> Bar
encoding = getattr(preferred.file, "encoding", None)
Expand Down Expand Up @@ -84,7 +96,8 @@ def __init__(self, *args, **kwargs):
**kwargs
)

self.original_handler = signal(SIGINT, self.handle_sigint)
self.original_handler = _signal_unless_backgrounded(
SIGINT, self.handle_sigint)

# If signal() returns None, the previous handler was not installed from
# Python, and we cannot restore it. This probably should not happen,
Expand All @@ -103,7 +116,7 @@ def finish(self):
normally, or gets interrupted.
"""
super(InterruptibleMixin, self).finish() # type: ignore
signal(SIGINT, self.original_handler)
_signal_unless_backgrounded(SIGINT, self.original_handler)

def handle_sigint(self, signum, frame): # type: ignore
"""
Expand Down
1 change: 1 addition & 0 deletions src/pip/_internal/cli/req_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ def make_requirement_preparer(
finder=finder,
require_hashes=options.require_hashes,
use_user_site=use_user_site,
use_shallow_wheels=('shallow_wheels' in options.unstable_features),
)

@staticmethod
Expand Down
1 change: 1 addition & 0 deletions src/pip/_internal/commands/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def run(self, options, args):
requirement_set = resolver.resolve(
reqs, check_supported_wheels=True
)
requirement_set.perform_all_final_hydration()

downloaded = ' '.join([req.name # type: ignore
for req in requirement_set.requirements.values()
Expand Down
1 change: 1 addition & 0 deletions src/pip/_internal/commands/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ def run(self, options, args):
requirement_set = resolver.resolve(
reqs, check_supported_wheels=not options.target_dir
)
requirement_set.perform_all_final_hydration()

try:
pip_req = requirement_set.get_requirement("pip")
Expand Down
1 change: 1 addition & 0 deletions src/pip/_internal/commands/wheel.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ def run(self, options, args):
requirement_set = resolver.resolve(
reqs, check_supported_wheels=True
)
requirement_set.perform_all_final_hydration()

reqs_to_build = [
r for r in requirement_set.requirements.values()
Expand Down
100 changes: 100 additions & 0 deletions src/pip/_internal/distributions/shallow_wheel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import os

from pip._vendor.pkg_resources import DistInfoDistribution

from pip._internal.distributions.base import AbstractDistribution
from pip._internal.network.shallow.httpfile import Context as HttpContext
from pip._internal.network.shallow.httpfile import Url
from pip._internal.network.shallow.wheel import Context as WheelContext
from pip._internal.network.shallow.wheel import (
ProjectName,
WheelMetadataRequest,
)
from pip._internal.network.shallow.zipfile import Context as ZipContext
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
from pip._internal.utils.wheel import WheelMetadata

if MYPY_CHECK_RUNNING:
from typing import Any
from pip._vendor.pkg_resources import Distribution
from pip._internal.index.package_finder import PackageFinder
from pip._internal.models.link import Link
from pip._internal.network.download import Downloader
from pip._internal.req import InstallRequirement


class DistributionNeedingFinalHydration(DistInfoDistribution):
    """A ``DistInfoDistribution`` that also remembers how to fetch its file.

    On top of the regular distribution arguments it stores a link, a
    downloader callable and a target directory, so that the file behind the
    link can be written to disk later through ``finally_hydrate()``.
    """

    def __init__(self, link, downloader, download_dir, *args, **kwargs):
        # type: (Link, Downloader, str, Any, Any) -> None
        # Everything except the three hydration arguments is forwarded
        # untouched to the parent constructor.
        super(DistributionNeedingFinalHydration, self).__init__(
            *args, **kwargs
        )
        self.final_link = link
        self.downloader = downloader
        self.download_dir = download_dir

    def finally_hydrate(self):
        # type: () -> None
        """Download ``final_link`` and write it into ``download_dir``.

        The output file is named after the ``filename`` reported by the
        download and is written chunk by chunk in binary mode.
        """
        fetched = self.downloader(self.final_link)
        destination = os.path.join(self.download_dir, fetched.filename)
        with open(destination, 'wb') as out_file:
            for piece in fetched.chunks:
                out_file.write(piece)


class ShallowWheelDistribution(AbstractDistribution):
    """A remote wheel whose metadata is obtained through the shallow helpers.

    ``get_pkg_resources_distribution()`` asks the shallow wheel context for
    the METADATA contents behind the requirement's link and wraps them in a
    ``DistributionNeedingFinalHydration``, which carries the link, downloader
    and download directory along.
    """

    def __init__(self, req, downloader, download_dir):
        # type: (InstallRequirement, Downloader, str) -> None
        super(ShallowWheelDistribution, self).__init__(req)
        self._download_dir = download_dir
        self._downloader = downloader

    @property
    def _wheel_context(self):
        # type: () -> WheelContext
        # A fresh http -> zip -> wheel context chain is assembled on every
        # access, built around the downloader's session.
        session = self._downloader.get_session()
        return WheelContext(ZipContext(HttpContext(session)))

    def get_pkg_resources_distribution(self):
        # type: () -> Distribution
        """Build a Distribution from metadata extracted from the remote wheel.

        The METADATA contents are held in memory; the returned object is
        constructed from them plus the link's filename.
        """
        # Wheels are never unnamed.
        assert self.req.name
        assert self.req.link

        name = ProjectName(self.req.name)
        link = self.req.link
        request = WheelMetadataRequest(
            url=Url(link.url),
            project_name=name,
        )

        extracted = self._wheel_context.extract_wheel_metadata(request)
        contents = extracted.contents

        filename = link.filename
        wheel_metadata = WheelMetadata({'METADATA': contents}, filename)

        return DistributionNeedingFinalHydration(
            link=link,
            downloader=self._downloader,
            download_dir=self._download_dir,
            location=filename,
            metadata=wheel_metadata,
            project_name=name.name,
        )

    def prepare_distribution_metadata(self, finder, build_isolation):
        # type: (PackageFinder, bool) -> None
        """Do nothing; this implementation is intentionally empty."""
        return None
4 changes: 4 additions & 0 deletions src/pip/_internal/network/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ def __init__(
self._session = session
self._progress_bar = progress_bar

def get_session(self):
    # type: () -> PipSession
    """Return the session stored on this instance as ``_session``."""
    session = self._session
    return session

def __call__(self, link):
# type: (Link) -> Download
try:
Expand Down
Empty file.
156 changes: 156 additions & 0 deletions src/pip/_internal/network/shallow/httpfile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
"""
Download byte ranges of remote files over HTTP.
"""

from collections import namedtuple

from pip._vendor import requests

from pip._internal.utils.typing import MYPY_CHECK_RUNNING
from pip._internal.utils.urls import get_url_scheme

if MYPY_CHECK_RUNNING:
from typing import Any, Optional


def url_is_remote(url):
    # type: (str) -> bool
    """Return True when the scheme of ``url`` is ``http`` or ``https``."""
    scheme = get_url_scheme(url)
    return scheme in ('http', 'https')


class Url(namedtuple('Url', 'url')):
    """One-field tuple wrapping a URL string accepted by ``url_is_remote``."""

    def __new__(cls, url):
        # type: (str) -> Url
        # Only http(s) URLs may be wrapped; anything else trips the assertion.
        assert url_is_remote(url)
        return super(Url, cls).__new__(cls, url=url)


class HttpFileRequest(namedtuple('HttpFileRequest', 'url')):
    """One-field record holding the ``url`` that a request refers to."""


class Size(namedtuple('Size', 'size')):
    """A non-negative integer size, stored in the single ``size`` field.

    Addition, subtraction and the ordering operators are only defined against
    another instance of the same type; any other operand trips an assertion.
    Results of ``+`` and ``-`` are new ``Size`` objects, so a negative
    difference fails the constructor's assertion.
    """

    def __new__(cls, size=0):
        # type: (int) -> Size
        assert size >= 0
        return super(Size, cls).__new__(cls, size=size)

    def _peer_size(self, other):
        # type: (Any) -> int
        # Shared operand check used by every operator below.
        assert isinstance(other, type(self))
        return other.size

    def __add__(self, other):
        # type: (Any) -> Size
        theirs = self._peer_size(other)
        return Size(self.size + theirs)

    def __sub__(self, other):
        # type: (Any) -> Size
        theirs = self._peer_size(other)
        return Size(self.size - theirs)

    def __lt__(self, other):
        # type: (Any) -> bool
        theirs = self._peer_size(other)
        return self.size < theirs

    def __le__(self, other):
        # type: (Any) -> bool
        theirs = self._peer_size(other)
        return self.size <= theirs

    def __gt__(self, other):
        # type: (Any) -> bool
        theirs = self._peer_size(other)
        return self.size > theirs

    def __ge__(self, other):
        # type: (Any) -> bool
        theirs = self._peer_size(other)
        return self.size >= theirs


class ByteRange(namedtuple('ByteRange', 'start end')):
    """A span of bytes from ``start`` up to, but not including, ``end``."""

    def __new__(cls, start, end):
        # type: (Size, Size) -> ByteRange
        assert end >= start
        return super(ByteRange, cls).__new__(cls, start=start, end=end)

    def as_bytes_range_header(self):
        # type: () -> str
        """Render this span as the value of a ``Range`` request header."""
        first_byte = self.start.size
        # NB: The byte ranges accepted in the header are inclusive, whereas
        # ``end`` is exclusive, so the last byte requested is one before it.
        last_byte = self.end.size - 1
        return "bytes={start}-{end}".format(start=first_byte, end=last_byte)

    def size_diff(self):
        # type: () -> Size
        """Return ``end - start``, i.e. how many bytes the span covers."""
        upper, lower = self.end, self.start
        return upper - lower


class BytesRangeRequest(namedtuple('BytesRangeRequest', 'start end')):
    """A requested byte span in which either bound may be left as ``None``.

    ``get_byte_range()`` turns the request into a concrete ``ByteRange`` once
    the total size is known.
    """

    def __new__(cls, start, end):
        # type: (Optional[Size], Optional[Size]) -> BytesRangeRequest
        # The ordering check only makes sense when both bounds are given.
        if not (start is None or end is None):
            assert end >= start
        return super(BytesRangeRequest, cls).__new__(
            cls, start=start, end=end)

    def get_byte_range(self, size):
        # type: (Size) -> ByteRange
        """Resolve this request against a file of the given ``size``.

        A missing start becomes 0 and a missing end becomes ``size``; bounds
        that are given must not exceed ``size``.
        """
        lower, upper = self.start, self.end

        if lower is not None:
            assert lower <= size, "???/start={start},size={size}".format(
                start=lower, size=size)
            first = lower.size
        else:
            first = 0

        if upper is not None:
            assert upper <= size
            last = upper.size
        else:
            last = size.size

        return ByteRange(start=Size(first), end=Size(last))


class HttpFile(namedtuple('HttpFile', 'url size')):
    """Record pairing a file's ``url`` with its ``size``."""


class Context(object):
    """Performs HEAD and ranged GET requests for remote files.

    All requests go through ``self.session``: the session passed to the
    constructor, or a new ``requests.Session()`` when none is given.
    """

    def __init__(self, session=None):
        # type: (Optional[requests.Session]) -> None
        self.session = session or requests.Session()

    def head(self, request):
        # type: (HttpFileRequest) -> HttpFile
        """Issue a HEAD request and return the file's url and size.

        Raises whatever ``raise_for_status()`` raises for an error response,
        and ``AssertionError`` when the response does not advertise support
        for byte ranges. The size is read from ``Content-Length``.
        """
        resp = self.session.head(request.url.url)
        resp.raise_for_status()
        # A missing Accept-Ranges header is treated like a wrong value, so it
        # fails the assertion below rather than escaping as a KeyError.
        accept_ranges = resp.headers.get("Accept-Ranges") or ""
        assert (
            "bytes" in accept_ranges
        ), "???/bytes was not found in range headers"
        content_length = int(resp.headers["Content-Length"])
        return HttpFile(url=request.url, size=Size(content_length))

    def range_request(self, http_file, request):
        # type: (HttpFile, BytesRangeRequest) -> bytes
        """Fetch the bytes of ``http_file`` selected by ``request``.

        The request is first resolved against the file's size, then sent as a
        GET with a ``Range`` header. Raises ``AssertionError`` if the number
        of bytes obtained does not match the resolved range.
        """
        byte_range = request.get_byte_range(http_file.size)
        resp = self.session.get(
            http_file.url.url,
            headers={"Range": byte_range.as_bytes_range_header()})
        resp.raise_for_status()

        if Size(len(resp.content)) == http_file.size:
            # This request for the full URL contents is cached, and we should
            # return just the requested byte range.
            start = byte_range.start.size
            end = byte_range.end.size
            response_bytes = resp.content[start:end]
        else:
            response_bytes = resp.content

        size_diff = byte_range.size_diff()
        assert (
            Size(len(response_bytes)) == size_diff
        ), ("???/response should have been length {}, but got (size {}):\n{!r}"
            .format(size_diff, len(response_bytes), response_bytes))
        return response_bytes
Loading