diff --git a/news/11158.feature.rst b/news/11158.feature.rst new file mode 100644 index 00000000000..74436d7dccf --- /dev/null +++ b/news/11158.feature.rst @@ -0,0 +1 @@ +Support `PEP 691 <https://peps.python.org/pep-0691/>`_. diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index c79e2410c80..04646ae1121 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -6,6 +6,7 @@ import email.message import functools import itertools +import json import logging import os import re @@ -65,32 +66,46 @@ def _match_vcs_scheme(url: str) -> Optional[str]: return None -class _NotHTML(Exception): +class _NotAPIContent(Exception): def __init__(self, content_type: str, request_desc: str) -> None: super().__init__(content_type, request_desc) self.content_type = content_type self.request_desc = request_desc -def _ensure_html_header(response: Response) -> None: - """Check the Content-Type header to ensure the response contains HTML. +def _ensure_api_header(response: Response) -> None: + """ + Check the Content-Type header to ensure the response contains a Simple + API response. - Raises `_NotHTML` if the content type is not text/html. + Raises `_NotAPIContent` if the content type is not a supported content type. """ - content_type = response.headers.get("Content-Type", "") - if not content_type.lower().startswith("text/html"): - raise _NotHTML(content_type, response.request.method) + content_type = response.headers.get("Content-Type", "Unknown") + + content_type_l = content_type.lower() + if content_type_l.startswith( ( "text/html", "application/vnd.pypi.simple.v1+html", "application/vnd.pypi.simple.v1+json", ) ): + return + + raise _NotAPIContent(content_type, response.request.method) class _NotHTTP(Exception): pass -def _ensure_html_response(url: str, session: PipSession) -> None: - """Send a HEAD request to the URL, and ensure the response contains HTML. +def _ensure_api_response(url: str, session: PipSession) -> None: + """ + Send a HEAD request to the URL, and ensure the response contains a Simple + API response. Raises `_NotHTTP` if the URL is not available for a HEAD request, or - `_NotHTML` if the content type is not text/html. + `_NotAPIContent` if the content type is not a supported content type. """ scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url) if scheme not in {"http", "https"}: @@ -99,31 +114,37 @@ def _ensure_html_response(url: str, session: PipSession) -> None: resp = session.head(url, allow_redirects=True) raise_for_status(resp) - _ensure_html_header(resp) + _ensure_api_header(resp)
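As context for the widened check above: `str.startswith` accepts a tuple, so the new code is a plain prefix match against the three supported content types. A minimal standalone sketch of the same rule (the helper name is illustrative, not part of the diff):

# Sketch of the Content-Type rule used by _ensure_api_header; prefix
# matching means parameters such as "; charset=utf-8" still pass.
_ACCEPTED = (
    "text/html",
    "application/vnd.pypi.simple.v1+html",
    "application/vnd.pypi.simple.v1+json",
)

def is_simple_api_content_type(content_type: str) -> bool:
    return content_type.lower().startswith(_ACCEPTED)

assert is_simple_api_content_type("Text/HTML; charset=utf-8")
assert is_simple_api_content_type("application/vnd.pypi.simple.v1+json")
assert not is_simple_api_content_type("application/octet-stream")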
-def _get_html_response(url: str, session: PipSession) -> Response: - """Access an HTML page with GET, and return the response. +def _get_simple_response(url: str, session: PipSession) -> Response: + """Access a Simple API response with GET, and return the response. This consists of three parts: 1. If the URL looks suspiciously like an archive, send a HEAD first to - check the Content-Type is HTML, to avoid downloading a large file. - Raise `_NotHTTP` if the content type cannot be determined, or - `_NotHTML` if it is not HTML. + check the Content-Type is HTML or Simple API, to avoid downloading a + large file. Raise `_NotHTTP` if the content type cannot be determined, or + `_NotAPIContent` if it is not HTML or a Simple API. 2. Actually perform the request. Raise HTTP exceptions on network failures. - 3. Check the Content-Type header to make sure we got HTML, and raise - `_NotHTML` otherwise. + 3. Check the Content-Type header to make sure we got a Simple API response, + and raise `_NotAPIContent` otherwise. """ if is_archive_file(Link(url).filename): - _ensure_html_response(url, session=session) + _ensure_api_response(url, session=session) logger.debug("Getting page %s", redact_auth_from_url(url)) resp = session.get( url, headers={ - "Accept": "text/html", + "Accept": ", ".join( + [ + "application/vnd.pypi.simple.v1+json", + "application/vnd.pypi.simple.v1+html; q=0.1", + "text/html; q=0.01", + ] + ), # We don't want to blindly return cached data for # /simple/, because authors generally expect that # twine upload && pip install will function, but if @@ -145,9 +166,16 @@ def _get_html_response(url: str, session: PipSession) -> Response: # The check for archives above only works if the url ends with # something that looks like an archive. However that is not a # requirement of an url. Unless we issue a HEAD request on every - # url we cannot know ahead of time for sure if something is HTML - # or not. However we can check after we've downloaded it. - _ensure_html_header(resp) + # url we cannot know ahead of time for sure if something is a + # Simple API response or not. However we can check after we've + # downloaded it. + _ensure_api_header(resp) + + logger.debug( + "Fetched page %s as %s", + redact_auth_from_url(url), + resp.headers.get("Content-Type", "Unknown"), + ) return resp
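The Accept header above drives ordinary HTTP content negotiation: a PEP 691 server should prefer the JSON form (implied q=1.0), while a legacy index that ignores q-values can still answer with text/html and be accepted. A rough client-side sketch using plain requests (the index URL is a placeholder, and pip itself goes through its vendored session rather than this direct call):

import requests

ACCEPT = ", ".join(
    [
        "application/vnd.pypi.simple.v1+json",
        "application/vnd.pypi.simple.v1+html; q=0.1",
        "text/html; q=0.01",
    ]
)

# Placeholder index URL; any PEP 503/691 index would behave the same way.
resp = requests.get("https://index.example.com/simple/pip/", headers={"Accept": ACCEPT})

if resp.headers.get("Content-Type", "").startswith("application/vnd.pypi.simple.v1+json"):
    data = resp.json()  # PEP 691 structured response
else:
    html = resp.text  # legacy server ignored the q-values and sent HTML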
@@ -273,7 +301,7 @@ def _create_link_from_element( class CacheablePageContent: - def __init__(self, page: "HTMLPage") -> None: + def __init__(self, page: "IndexContent") -> None: assert page.cache_link_parsing self.page = page @@ -286,15 +314,15 @@ def __hash__(self) -> int: class ParseLinks(Protocol): def __call__( - self, page: "HTMLPage", use_deprecated_html5lib: bool + self, page: "IndexContent", use_deprecated_html5lib: bool ) -> Iterable[Link]: ... -def with_cached_html_pages(fn: ParseLinks) -> ParseLinks: +def with_cached_index_content(fn: ParseLinks) -> ParseLinks: """ - Given a function that parses an Iterable[Link] from an HTMLPage, cache the - function's result (keyed by CacheablePageContent), unless the HTMLPage + Given a function that parses an Iterable[Link] from an IndexContent, cache the + function's result (keyed by CacheablePageContent), unless the IndexContent `page` has `page.cache_link_parsing == False`. """ @@ -305,7 +333,9 @@ def wrapper( return list(fn(cacheable_page.page, use_deprecated_html5lib)) @functools.wraps(fn) - def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Link]: + def wrapper_wrapper( + page: "IndexContent", use_deprecated_html5lib: bool + ) -> List[Link]: if page.cache_link_parsing: return wrapper(CacheablePageContent(page), use_deprecated_html5lib) return list(fn(page, use_deprecated_html5lib)) @@ -313,7 +343,7 @@ def wrapper_wrapper(page: "HTMLPage", use_deprecated_html5lib: bool) -> List[Lin return wrapper_wrapper -def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]: +def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]: """ Parse an HTML document, and yield its anchor elements as Link objects. @@ -338,12 +368,36 @@ def _parse_links_html5lib(page: "HTMLPage") -> Iterable[Link]: yield link -@with_cached_html_pages -def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Link]: +@with_cached_index_content +def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable[Link]: """ - Parse an HTML document, and yield its anchor elements as Link objects. + Parse a Simple API's Index Content, and yield its anchor elements as Link objects. """ + content_type_l = page.content_type.lower() + if content_type_l.startswith("application/vnd.pypi.simple.v1+json"): + data = json.loads(page.content) + for file in data.get("files", []): + file_url = file.get("url") + if file_url is None: + continue + + # Link.yanked_reason expects an empty string instead of a boolean. + yanked_reason = file.get("yanked") + if yanked_reason and not isinstance(yanked_reason, str): + yanked_reason = "" + # Link.yanked_reason expects None instead of False. + elif not yanked_reason: + yanked_reason = None + + yield Link( + _clean_link(urllib.parse.urljoin(page.url, file_url)), + comes_from=page.url, + requires_python=file.get("requires-python"), + yanked_reason=yanked_reason, + hashes=file.get("hashes", {}), + ) + if use_deprecated_html5lib: yield from _parse_links_html5lib(page) return
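For reference, the JSON branch above consumes a PEP 691 project page shaped roughly as below; per the PEP, `yanked` may be a boolean or a reason string, which is why the code normalizes it to "" or None (all field values here are placeholders):

import json

page_content = json.dumps(
    {
        "meta": {"api-version": "1.0"},
        "name": "example",
        "files": [
            {
                "filename": "example-1.0.tar.gz",
                "url": "https://files.example.com/example-1.0.tar.gz",
                "hashes": {"sha256": "<hex digest>"},
                "requires-python": ">=3.7",
                "yanked": True,
            }
        ],
    }
).encode("utf-8")

for file in json.loads(page_content).get("files", []):
    yanked = file.get("yanked")
    # Mirror the normalization above: truthy non-str -> "", False/absent -> None.
    yanked = "" if yanked and not isinstance(yanked, str) else (yanked or None)
    print(file["url"], file.get("hashes", {}), repr(yanked))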
" + "The only supported Content-Types are application/vnd.pypi.simple.v1+json, " + "application/vnd.pypi.simple.v1+html, and text/html", link, exc.request_desc, exc.content_type, ) except NetworkConnectionError as exc: - _handle_get_page_fail(link, exc) + _handle_get_simple_fail(link, exc) except RetryError as exc: - _handle_get_page_fail(link, exc) + _handle_get_simple_fail(link, exc) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " reason += str(exc) - _handle_get_page_fail(link, reason, meth=logger.info) + _handle_get_simple_fail(link, reason, meth=logger.info) except requests.ConnectionError as exc: - _handle_get_page_fail(link, f"connection error: {exc}") + _handle_get_simple_fail(link, f"connection error: {exc}") except requests.Timeout: - _handle_get_page_fail(link, "timed out") + _handle_get_simple_fail(link, "timed out") else: - return _make_html_page(resp, cache_link_parsing=link.cache_link_parsing) + return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing) return None @@ -561,11 +625,11 @@ def create( def find_links(self) -> List[str]: return self.search_scope.find_links - def fetch_page(self, location: Link) -> Optional[HTMLPage]: + def fetch_response(self, location: Link) -> Optional[IndexContent]: """ Fetch an HTML page containing package links. """ - return _get_html_page(location, session=self.session) + return _get_index_content(location, session=self.session) def collect_sources( self, diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index f70f74b17c6..dbb6a64066c 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -792,11 +792,11 @@ def process_project_url( "Fetching project page and analyzing links: %s", project_url, ) - html_page = self._link_collector.fetch_page(project_url) - if html_page is None: + index_response = self._link_collector.fetch_response(project_url) + if index_response is None: return [] - page_links = list(parse_links(html_page, self._use_deprecated_html5lib)) + page_links = list(parse_links(index_response, self._use_deprecated_html5lib)) with indent_log(): package_links = self.evaluate_links( diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 6069b278b9b..8fd1c3d9960 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -4,7 +4,16 @@ import posixpath import re import urllib.parse -from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Tuple, Union +from typing import ( + TYPE_CHECKING, + Dict, + List, + Mapping, + NamedTuple, + Optional, + Tuple, + Union, +) from pip._internal.utils.filetypes import WHEEL_EXTENSION from pip._internal.utils.hashes import Hashes @@ -17,12 +26,14 @@ from pip._internal.utils.urls import path_to_url, url_to_path if TYPE_CHECKING: - from pip._internal.index.collector import HTMLPage + from pip._internal.index.collector import IndexContent logger = logging.getLogger(__name__) -_SUPPORTED_HASHES = ("sha1", "sha224", "sha384", "sha256", "sha512", "md5") +# Order matters, earlier hashes have a precedence over later hashes for what +# we will pick to use. 
+_SUPPORTED_HASHES = ("sha512", "sha384", "sha256", "sha224", "sha1", "md5") class Link(KeyBasedCompareMixin): @@ -31,6 +42,7 @@ class Link(KeyBasedCompareMixin): __slots__ = [ "_parsed_url", "_url", + "_hashes", "comes_from", "requires_python", "yanked_reason", @@ -40,14 +52,15 @@ class Link(KeyBasedCompareMixin): def __init__( self, url: str, - comes_from: Optional[Union[str, "HTMLPage"]] = None, + comes_from: Optional[Union[str, "IndexContent"]] = None, requires_python: Optional[str] = None, yanked_reason: Optional[str] = None, cache_link_parsing: bool = True, + hashes: Optional[Mapping[str, str]] = None, ) -> None: """ :param url: url of the resource pointed to (href of the link) - :param comes_from: instance of HTMLPage where the link was found, + :param comes_from: instance of IndexContent where the link was found, or string. :param requires_python: String containing the `Requires-Python` metadata field, specified in PEP 345. This may be specified by @@ -64,6 +77,8 @@ def __init__( should be cached. PyPI index urls should generally have this set to False, for example. + :param hashes: A mapping of hash names to digests to allow us to + determine the validity of a download. """ # url can be a UNC windows share @@ -74,6 +89,7 @@ def __init__( # Store the url as a private attribute to prevent accidentally # trying to set a new value. self._url = url + self._hashes = hashes if hashes is not None else {} self.comes_from = comes_from self.requires_python = requires_python if requires_python else None @@ -171,16 +187,26 @@ def subdirectory_fragment(self) -> Optional[str]: @property def hash(self) -> Optional[str]: + for hashname in _SUPPORTED_HASHES: + if hashname in self._hashes: + return self._hashes[hashname] + match = self._hash_re.search(self._url) if match: return match.group(2) + return None @property def hash_name(self) -> Optional[str]: + for hashname in _SUPPORTED_HASHES: + if hashname in self._hashes: + return hashname + match = self._hash_re.search(self._url) if match: return match.group(1) + return None @property diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py index 868a13b03b2..eff2594cad9 100644 --- a/tests/unit/test_collector.py +++ b/tests/unit/test_collector.py @@ -1,4 +1,5 @@ import itertools +import json import logging import os import re @@ -13,15 +14,15 @@ from pip._internal.exceptions import NetworkConnectionError from pip._internal.index.collector import ( - HTMLPage, + IndexContent, LinkCollector, _clean_link, _clean_url_path, _determine_base_url, - _get_html_page, - _get_html_response, - _make_html_page, - _NotHTML, + _get_index_content, + _get_simple_response, + _make_index_content, + _NotAPIContent, _NotHTTP, parse_links, ) @@ -32,6 +33,14 @@ from pip._internal.network.session import PipSession from tests.lib import TestData, make_test_link_collector +ACCEPT = ", ".join( + [ + "application/vnd.pypi.simple.v1+json", + "application/vnd.pypi.simple.v1+html; q=0.1", + "text/html; q=0.01", + ] +) + @pytest.mark.parametrize( "url", @@ -40,13 +49,13 @@ "file:///opt/data/pip-18.0.tar.gz", ], ) -def test_get_html_response_archive_to_naive_scheme(url: str) -> None: +def test_get_simple_response_archive_to_naive_scheme(url: str) -> None: """ - `_get_html_response()` should error on an archive-like URL if the scheme + `_get_simple_response()` should error on an archive-like URL if the scheme does not allow "poking" without getting data. 
""" with pytest.raises(_NotHTTP): - _get_html_response(url, session=mock.Mock(PipSession)) + _get_simple_response(url, session=mock.Mock(PipSession)) @pytest.mark.parametrize( @@ -57,12 +66,12 @@ def test_get_html_response_archive_to_naive_scheme(url: str) -> None: ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_archive_to_http_scheme( +def test_get_simple_response_archive_to_http_scheme( mock_raise_for_status: mock.Mock, url: str, content_type: str ) -> None: """ - `_get_html_response()` should send a HEAD request on an archive-like URL - if the scheme supports it, and raise `_NotHTML` if the response isn't HTML. + `_get_simple_response()` should send a HEAD request on an archive-like URL + if the scheme supports it, and raise `_NotAPIContent` if the response isn't HTML. """ session = mock.Mock(PipSession) session.head.return_value = mock.Mock( @@ -72,8 +81,8 @@ def test_get_html_response_archive_to_http_scheme( } ) - with pytest.raises(_NotHTML) as ctx: - _get_html_response(url, session=session) + with pytest.raises(_NotAPIContent) as ctx: + _get_simple_response(url, session=session) session.assert_has_calls( [ @@ -91,10 +100,10 @@ def test_get_html_response_archive_to_http_scheme( ("file:///opt/data/pip-18.0.tar.gz"), ], ) -def test_get_html_page_invalid_content_type_archive( +def test_get_index_content_invalid_content_type_archive( caplog: pytest.LogCaptureFixture, url: str ) -> None: - """`_get_html_page()` should warn if an archive URL is not HTML + """`_get_index_content()` should warn if an archive URL is not HTML and therefore cannot be used for a HEAD request. """ caplog.set_level(logging.WARNING) @@ -102,7 +111,7 @@ def test_get_html_page_invalid_content_type_archive( session = mock.Mock(PipSession) - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None assert ( "pip._internal.index.collector", logging.WARNING, @@ -119,11 +128,11 @@ def test_get_html_page_invalid_content_type_archive( ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_archive_to_http_scheme_is_html( +def test_get_simple_response_archive_to_http_scheme_is_html( mock_raise_for_status: mock.Mock, url: str ) -> None: """ - `_get_html_response()` should work with archive-like URLs if the HEAD + `_get_simple_response()` should work with archive-like URLs if the HEAD request is responded with text/html. 
""" session = mock.Mock(PipSession) @@ -135,7 +144,7 @@ def test_get_html_response_archive_to_http_scheme_is_html( ) session.get.return_value = mock.Mock(headers={"Content-Type": "text/html"}) - resp = _get_html_response(url, session=session) + resp = _get_simple_response(url, session=session) assert resp is not None assert session.mock_calls == [ @@ -143,7 +152,7 @@ def test_get_html_response_archive_to_http_scheme_is_html( mock.call.get( url, headers={ - "Accept": "text/html", + "Accept": ACCEPT, "Cache-Control": "max-age=0", }, ), @@ -163,9 +172,11 @@ def test_get_html_response_archive_to_http_scheme_is_html( ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_no_head(mock_raise_for_status: mock.Mock, url: str) -> None: +def test_get_simple_response_no_head( + mock_raise_for_status: mock.Mock, url: str +) -> None: """ - `_get_html_response()` shouldn't send a HEAD request if the URL does not + `_get_simple_response()` shouldn't send a HEAD request if the URL does not look like an archive, only the GET request that retrieves data. """ session = mock.Mock(PipSession) @@ -179,7 +190,7 @@ def test_get_html_response_no_head(mock_raise_for_status: mock.Mock, url: str) - ) ) - resp = _get_html_response(url, session=session) + resp = _get_simple_response(url, session=session) assert resp is not None assert session.head.call_count == 0 @@ -187,21 +198,22 @@ def test_get_html_response_no_head(mock_raise_for_status: mock.Mock, url: str) - mock.call( url, headers={ - "Accept": "text/html", + "Accept": ACCEPT, "Cache-Control": "max-age=0", }, ), - mock.call().headers.get("Content-Type", ""), + mock.call().headers.get("Content-Type", "Unknown"), + mock.call().headers.get("Content-Type", "Unknown"), ] mock_raise_for_status.assert_called_once_with(resp) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_response_dont_log_clear_text_password( +def test_get_simple_response_dont_log_clear_text_password( mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture ) -> None: """ - `_get_html_response()` should redact the password from the index URL + `_get_simple_response()` should redact the password from the index URL in its DEBUG log message. """ session = mock.Mock(PipSession) @@ -217,19 +229,24 @@ def test_get_html_response_dont_log_clear_text_password( caplog.set_level(logging.DEBUG) - resp = _get_html_response( + resp = _get_simple_response( "https://user:my_password@example.com/simple/", session=session ) assert resp is not None mock_raise_for_status.assert_called_once_with(resp) - assert len(caplog.records) == 1 + assert len(caplog.records) == 2 record = caplog.records[0] assert record.levelname == "DEBUG" assert record.message.splitlines() == [ "Getting page https://user:****@example.com/simple/", ] + record = caplog.records[1] + assert record.levelname == "DEBUG" + assert record.message.splitlines() == [ + "Fetched page https://user:****@example.com/simple/ as text/html", + ] @pytest.mark.parametrize( @@ -426,8 +443,9 @@ def _test_parse_links_data_attribute( "{}" ).format(anchor_html) html_bytes = html.encode("utf-8") - page = HTMLPage( + page = IndexContent( html_bytes, + "text/html", encoding=None, # parse_links() is cached by url, so we inject a random uuid to ensure # the page content isn't cached. 
@@ -464,6 +482,57 @@ def test_parse_links__requires_python( _test_parse_links_data_attribute(anchor_html, "requires_python", expected) +def test_parse_links_json() -> None: + json_bytes = json.dumps( + { + "meta": {"api-version": "1.0"}, + "name": "holygrail", + "files": [ + { + "filename": "holygrail-1.0.tar.gz", + "url": "https://example.com/files/holygrail-1.0.tar.gz", + "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + "requires-python": ">=3.7", + "yanked": "Had a vulnerability", + }, + { + "filename": "holygrail-1.0-py3-none-any.whl", + "url": "/files/holygrail-1.0-py3-none-any.whl", + "hashes": {"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + "requires-python": ">=3.7", + "dist-info-metadata": False, + }, + ], + } + ).encode("utf8") + page = IndexContent( + json_bytes, + "application/vnd.pypi.simple.v1+json", + encoding=None, + # parse_links() is cached by url, so we inject a random uuid to ensure + # the page content isn't cached. + url=f"https://example.com/simple-{uuid.uuid4()}/", + ) + links = list(parse_links(page, use_deprecated_html5lib=False)) + + assert links == [ + Link( + "https://example.com/files/holygrail-1.0.tar.gz", + comes_from=page.url, + requires_python=">=3.7", + yanked_reason="Had a vulnerability", + hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + ), + Link( + "https://example.com/files/holygrail-1.0-py3-none-any.whl", + comes_from=page.url, + requires_python=">=3.7", + yanked_reason=None, + hashes={"sha256": "sha256 hash", "blake2b": "blake2b hash"}, + ), + ] + +
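Note that the second file in the new test above uses a relative URL; `parse_links` resolves it against the page URL with `urllib.parse.urljoin`, which is why the expected Link in the assertion is absolute:

import urllib.parse

page_url = "https://example.com/simple-1234/"  # stand-in for the uuid-based URL
assert (
    urllib.parse.urljoin(page_url, "/files/holygrail-1.0-py3-none-any.whl")
    == "https://example.com/files/holygrail-1.0-py3-none-any.whl"
)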
@pytest.mark.parametrize( "anchor_html, expected", [ @@ -503,23 +572,26 @@ def test_parse_links_caches_same_page_by_url() -> None: url = "https://example.com/simple/" - page_1 = HTMLPage( + page_1 = IndexContent( html_bytes, + "text/html", encoding=None, url=url, ) # Make a second page with zero content, to ensure that it's not accessed, # because the page was cached by url. - page_2 = HTMLPage( + page_2 = IndexContent( b"", + "text/html", encoding=None, url=url, ) # Make a third page which represents an index url, which should not be # cached, even for the same url. We modify the page content slightly to # verify that the result is not cached. - page_3 = HTMLPage( + page_3 = IndexContent( re.sub(b"pkg1", b"pkg2", html_bytes), + "text/html", encoding=None, url=url, cache_link_parsing=False, @@ -541,7 +613,9 @@ def test_parse_links_caches_same_page_by_url() -> None: def test_parse_link_handles_deprecated_usage_properly() -> None: html = b'' url = "https://example.com/simple/" - page = HTMLPage(html, encoding=None, url=url, cache_link_parsing=False) + page = IndexContent( + html, "text/html", encoding=None, url=url, cache_link_parsing=False + ) parsed_links = list(parse_links(page, use_deprecated_html5lib=True)) @@ -559,7 +633,7 @@ def test_request_http_error( session = mock.Mock(PipSession) session.get.return_value = mock.Mock() mock_raise_for_status.side_effect = NetworkConnectionError("Http error") - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None assert "Could not fetch URL http://localhost: Http error - skipping" in caplog.text @@ -568,11 +642,11 @@ def test_request_retries(caplog: pytest.LogCaptureFixture) -> None: link = Link("http://localhost") session = mock.Mock(PipSession) session.get.side_effect = requests.exceptions.RetryError("Retry error") - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None assert "Could not fetch URL http://localhost: Retry error - skipping" in caplog.text -def test_make_html_page() -> None: +def test_make_index_content() -> None: headers = {"Content-Type": "text/html; charset=UTF-8"} response = mock.Mock( content=b"", url="https://example.com/index.html", headers=headers, ) - actual = _make_html_page(response) + actual = _make_index_content(response) assert actual.content == b"" assert actual.encoding == "UTF-8" assert actual.url == "https://example.com/index.html" @@ -593,15 +667,15 @@ ("git+https://github.com/pypa/pip.git", "git"), ], ) -def test_get_html_page_invalid_scheme( +def test_get_index_content_invalid_scheme( caplog: pytest.LogCaptureFixture, url: str, vcs_scheme: str ) -> None: - """`_get_html_page()` should error if an invalid scheme is given. + """`_get_index_content()` should error if an invalid scheme is given. Only file:, http:, https:, and ftp: are allowed. """ with caplog.at_level(logging.WARNING): - page = _get_html_page(Link(url), session=mock.Mock(PipSession)) + page = _get_index_content(Link(url), session=mock.Mock(PipSession)) assert page is None assert caplog.record_tuples == [ @@ -622,12 +696,12 @@ ], ) @mock.patch("pip._internal.index.collector.raise_for_status") -def test_get_html_page_invalid_content_type( +def test_get_index_content_invalid_content_type( mock_raise_for_status: mock.Mock, caplog: pytest.LogCaptureFixture, content_type: str, ) -> None: - """`_get_html_page()` should warn if an invalid content-type is given. + """`_get_index_content()` should warn if an invalid content-type is given. Only text/html and the Simple API content types are allowed. """ caplog.set_level(logging.DEBUG) @@ -641,13 +715,14 @@ "headers": {"Content-Type": content_type}, } ) - assert _get_html_page(link, session=session) is None + assert _get_index_content(link, session=session) is None mock_raise_for_status.assert_called_once_with(session.get.return_value) assert ( "pip._internal.index.collector", logging.WARNING, - "Skipping page {} because the GET request got Content-Type: {}."
- "The only supported Content-Type is text/html".format(url, content_type), + "Skipping page {} because the GET request got Content-Type: {}. " + "The only supported Content-Types are application/vnd.pypi.simple.v1+json, " + "application/vnd.pypi.simple.v1+html, and text/html".format(url, content_type), ) in caplog.record_tuples @@ -664,11 +739,11 @@ def make_fake_html_response(url: str) -> mock.Mock: """ ) content = html.encode("utf-8") - return mock.Mock(content=content, url=url, headers={}) + return mock.Mock(content=content, url=url, headers={"Content-Type": "text/html"}) -def test_get_html_page_directory_append_index(tmpdir: Path) -> None: - """`_get_html_page()` should append "index.html" to a directory URL.""" +def test_get_index_content_directory_append_index(tmpdir: Path) -> None: + """`_get_index_content()` should append "index.html" to a directory URL.""" dirpath = tmpdir / "something" dirpath.mkdir() dir_url = dirpath.as_uri() @@ -676,10 +751,10 @@ def test_get_html_page_directory_append_index(tmpdir: Path) -> None: session = mock.Mock(PipSession) fake_response = make_fake_html_response(expected_url) - mock_func = mock.patch("pip._internal.index.collector._get_html_response") + mock_func = mock.patch("pip._internal.index.collector._get_simple_response") with mock_func as mock_func: mock_func.return_value = fake_response - actual = _get_html_page(Link(dir_url), session=session) + actual = _get_index_content(Link(dir_url), session=session) assert mock_func.mock_calls == [ mock.call(expected_url, session=session), ], f"actual calls: {mock_func.mock_calls}" @@ -779,16 +854,16 @@ def check_links_include(links: List[Link], names: List[str]) -> None: class TestLinkCollector: - @mock.patch("pip._internal.index.collector._get_html_response") - def test_fetch_page(self, mock_get_html_response: mock.Mock) -> None: + @mock.patch("pip._internal.index.collector._get_simple_response") + def test_fetch_response(self, mock_get_simple_response: mock.Mock) -> None: url = "https://pypi.org/simple/twine/" fake_response = make_fake_html_response(url) - mock_get_html_response.return_value = fake_response + mock_get_simple_response.return_value = fake_response location = Link(url, cache_link_parsing=False) link_collector = make_test_link_collector() - actual = link_collector.fetch_page(location) + actual = link_collector.fetch_response(location) assert actual is not None assert actual.content == fake_response.content @@ -797,8 +872,8 @@ def test_fetch_page(self, mock_get_html_response: mock.Mock) -> None: assert actual.cache_link_parsing == location.cache_link_parsing # Also check that the right session object was passed to - # _get_html_response(). - mock_get_html_response.assert_called_once_with( + # _get_simple_response(). + mock_get_simple_response.assert_called_once_with( url, session=link_collector.session, )