From 913757cb53058b8a37d7c07d96de839fe3b2e4e6 Mon Sep 17 00:00:00 2001 From: Nicolas Bock Date: Thu, 7 Feb 2019 12:46:37 -0700 Subject: [PATCH] Do not clean base_url When the `base_url` is a `[]` protected IPv6 address, the `_clean_link()` function converts `[` to `%5B` and `]` to `%5D`, which renders the `base_url` invalid. For example: ``` Starting new HTTP connection (1): fd00:0:0:236::100:8181 http://fd00:0:0:236::100:8181 "GET /os-releases/19.0.0.0b1/opensuse_leap-42.3-x86_64/requirements_absolute_requirements.txt HTTP/1.1" 200 None Setting setuptools==40.6.3 (from -c http://[fd00:0:0:236::100]:8181/os-releases/19.0.0.0b1/opensuse_leap-42.3-x86_64/requirements_absolute_requirements.txt (line 204)) extras to: () Looking in indexes: http://[fd00:0:0:236::100]:8181/simple Collecting setuptools==40.6.3 (from -c http://[fd00:0:0:236::100]:8181/os-releases/19.0.0.0b1/opensuse_leap-42.3-x86_64/requirements_absolute_requirements.txt (line 204)) 1 location(s) to search for versions of setuptools: * http://[fd00:0:0:236::100]:8181/simple/setuptools/ Getting page http://[fd00:0:0:236::100]:8181/simple/setuptools/ http://fd00:0:0:236::100:8181 "GET /simple/setuptools/ HTTP/1.1" 200 376 Analyzing links from page http://[fd00:0:0:236::100]:8181/simple/setuptools/ _package_versions: link = http://%5bfd00:0:0:236::100%5d:8181/packages/opensuse_leap-42.3-x86_64/setuptools/setuptools-40.6.3-py2.py3-none-any.whl#md5=389d3cd088d7afec3a1133b1d8e15df0 (from http://[fd00:0:0: 236::100]:8181/simple/setuptools/) _link_package_versions: link = http://%5bfd00:0:0:236::100%5d:8181/packages/opensuse_leap-42.3-x86_64/setuptools/setuptools-40.6.3-py2.py3-none-any.whl#md5=389d3cd088d7afec3a1133b1d8e15df0 (from http://[fd00 :0:0:236::100]:8181/simple/setuptools/) Found link http://%5bfd00:0:0:236::100%5d:8181/packages/opensuse_leap-42.3-x86_64/setuptools/setuptools-40.6.3-py2.py3-none-any.whl#md5=389d3cd088d7afec3a1133b1d8e15df0 (from http://[fd00:0:0:236::100]:8181/ simple/setuptools/), version: 40.6.3 Using version 40.6.3 (newest of versions: 40.6.3) Could not install packages due to an EnvironmentError. InvalidURL: Failed to parse: %5bfd00:0:0:236::100%5d:8181 ``` This change uses the vendored `urllib` library to split the host part off of the url before URL quoting only the path part. Fixes: #6285 Signed-off-by: Nicolas Bock --- news/6285.bugfix | 1 + src/pip/_internal/index.py | 21 ++++++++++--- tests/unit/test_index.py | 64 +++++++++++++++++++++++++++++++++++++- 3 files changed, 81 insertions(+), 5 deletions(-) create mode 100644 news/6285.bugfix diff --git a/news/6285.bugfix b/news/6285.bugfix new file mode 100644 index 00000000000..731300a047f --- /dev/null +++ b/news/6285.bugfix @@ -0,0 +1 @@ +Fix incorrect URL quoting of IPv6 addresses. diff --git a/src/pip/_internal/index.py b/src/pip/_internal/index.py index 8af659634de..a1ef4b6d59c 100644 --- a/src/pip/_internal/index.py +++ b/src/pip/_internal/index.py @@ -939,15 +939,28 @@ def _get_encoding_from_headers(headers): return None -_CLEAN_LINK_RE = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I) - - def _clean_link(url): # type: (str) -> str """Makes sure a link is fully encoded. That is, if a ' ' shows up in the link, it will be rewritten to %20 (while not over-quoting % or other characters).""" - return _CLEAN_LINK_RE.sub(lambda match: '%%%2x' % ord(match.group(0)), url) + # Split the URL into parts according to the general structure + # `scheme://netloc/path;parameters?query#fragment`. Note that the + # `netloc` can be empty and the URI will then refer to a local + # filesystem path. + result = urllib_parse.urlparse(url) + # In both cases below we unquote prior to quoting to make sure + # nothing is double quoted. + if result.netloc == "": + # On Windows the path part might contain a drive letter which + # should not be quoted. On Linux where drive letters do not + # exist, the colon should be quoted. We rely on urllib.request + # to do the right thing here. + path = urllib_request.pathname2url( + urllib_request.url2pathname(result.path)) + else: + path = urllib_parse.quote(urllib_parse.unquote(result.path)) + return urllib_parse.urlunparse(result._replace(path=path)) class HTMLPage(object): diff --git a/tests/unit/test_index.py b/tests/unit/test_index.py index e46d49531a2..bb12727e13e 100644 --- a/tests/unit/test_index.py +++ b/tests/unit/test_index.py @@ -7,7 +7,7 @@ from pip._internal.download import PipSession from pip._internal.index import ( - Link, PackageFinder, _determine_base_url, _egg_info_matches, + Link, PackageFinder, _clean_link, _determine_base_url, _egg_info_matches, _find_name_version_sep, _get_html_page, ) @@ -280,3 +280,65 @@ def test_request_retries(caplog): 'Could not fetch URL http://localhost: Retry error - skipping' in caplog.text ) + + +@pytest.mark.parametrize( + ("url", "clean_url"), + [ + # URL with hostname and port. Port separator should not be quoted. + ("https://localhost.localdomain:8181/path/with space/", + "https://localhost.localdomain:8181/path/with%20space/"), + # URL that is already properly quoted. The quoting `%` + # characters should not be quoted again. + ("https://localhost.localdomain:8181/path/with%20quoted%20space/", + "https://localhost.localdomain:8181/path/with%20quoted%20space/"), + # URL with IPv4 address and port. + ("https://127.0.0.1:8181/path/with space/", + "https://127.0.0.1:8181/path/with%20space/"), + # URL with IPv6 address and port. The `[]` brackets around the + # IPv6 address should not be quoted. + ("https://[fd00:0:0:236::100]:8181/path/with space/", + "https://[fd00:0:0:236::100]:8181/path/with%20space/"), + # URL with query. The leading `?` should not be quoted. + ("https://localhost.localdomain:8181/path/with/query?request=test", + "https://localhost.localdomain:8181/path/with/query?request=test"), + # URL with colon in the path portion. + ("https://localhost.localdomain:8181/path:/with:/colon", + "https://localhost.localdomain:8181/path%3A/with%3A/colon"), + # URL with something that looks like a drive letter, but is + # not. The `:` should be quoted. + ("https://localhost.localdomain/T:/path/", + "https://localhost.localdomain/T%3A/path/") + ] +) +def test_clean_link(url, clean_url): + assert(_clean_link(url) == clean_url) + + +@pytest.mark.parametrize( + ("url", "clean_url"), + [ + # URL with Windows drive letter. The `:` after the drive + # letter should not be quoted. The trailing `/` should be + # removed. + ("file:///T:/path/with spaces/", + "file:///T:/path/with%20spaces") + ] +) +@pytest.mark.skipif("sys.platform != 'win32'") +def test_clean_link_windows(url, clean_url): + assert(_clean_link(url) == clean_url) + + +@pytest.mark.parametrize( + ("url", "clean_url"), + [ + # URL with Windows drive letter, running on non-windows + # platform. The `:` after the drive should be quoted. + ("file:///T:/path/with spaces/", + "file:///T%3A/path/with%20spaces/") + ] +) +@pytest.mark.skipif("sys.platform == 'win32'") +def test_clean_link_non_windows(url, clean_url): + assert(_clean_link(url) == clean_url)