From ea3868f5096c373e7e18bcc8ef41a3aec9ad5e53 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 15:25:54 -0500 Subject: [PATCH 01/19] Fix round-trip of IPv6 addresses The brackets were missing when the URL was converted to a string fixes #1157 --- tests/test_url_parsing.py | 15 +++++++++++++++ yarl/_url.py | 38 +++++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/tests/test_url_parsing.py b/tests/test_url_parsing.py index 4fe95185e..b873a3140 100644 --- a/tests/test_url_parsing.py +++ b/tests/test_url_parsing.py @@ -604,3 +604,18 @@ def test_schemes_that_require_host(scheme: str) -> None: ) with pytest.raises(ValueError, match=expect): URL(f"{scheme}://:1") + + +@pytest.mark.parametrize( + ("url", "hostname"), + [("http://[::1]", "[::1]"), ("http://[::1]:8080", "[::1]")], +) +def test_ipv6_url_round_trips(url: str, hostname: str) -> None: + """Verify that IPv6 URLs round-trip correctly.""" + parsed = URL(url) + hostname_without_brackets = hostname[1:-1] + assert parsed._val.hostname == hostname_without_brackets + assert parsed.raw_host == hostname_without_brackets + assert parsed.literal_host == hostname + assert str(parsed) == url + assert str(URL(str(parsed))) == url diff --git a/yarl/_url.py b/yarl/_url.py index 26da688ac..35a69ba26 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -395,7 +395,7 @@ def __str__(self) -> str: netloc=self._make_netloc( self.raw_user, self.raw_password, - self.raw_host, + self.literal_host, port, encode_host=False, ) @@ -647,6 +647,8 @@ def raw_host(self) -> Union[str, None]: None for relative URLs. + For literal IPv6 addresses, use the literal_host property instead + as it will return the host part with brackets. """ # Use host instead of hostname for sake of shortness # May add .hostname prop later @@ -660,16 +662,33 @@ def host(self) -> Union[str, None]: None for relative URLs. 
""" - raw = self.raw_host - if raw is None: + if (raw := self.raw_host) is None: return None - if "%" in raw: - # Hack for scoped IPv6 addresses like - # fe80::2%Перевірка - # presence of '%' sign means only IPv6 address, so idna is useless. + if raw and (":" in raw or raw[-1].isdigit()): + # IP addresses are never IDNA encoded return raw return _idna_decode(raw) + @cached_property + def literal_host(self) -> Union[str, None]: + """Return the literal host part of URL. + + None for relative URLs. + + https://datatracker.ietf.org/doc/html/rfc2732#section-2 + + Examples: + 'http://example.com:8080' -> 'example.com' + 'http://example.com:80' -> 'example.com' + 'https://127.0.0.1:8443' -> '127.0.0.1' + 'https://[::1]:8443' -> '[::1]' + 'http://[::1]' -> '[::1]' + + """ + if (raw := self.raw_host) is None: + return None + return f"[{raw}]" if ":" in raw else raw + @cached_property def port(self) -> Union[int, None]: """Port part of URL, with scheme-based fallback. @@ -945,7 +964,7 @@ def _encode_host(cls, host: str, human: bool = False) -> str: raw_ip = host sep = zone = "" - if raw_ip and raw_ip[-1].isdigit() or ":" in raw_ip: + if raw_ip and (":" in raw_ip or raw_ip[-1].isdigit()): # Might be an IP address, check it # # IP Addresses can look like: @@ -953,7 +972,8 @@ def _encode_host(cls, host: str, human: bool = False) -> str: # - 127.0.0.1 (last character is a digit) # - 2001:db8::ff00:42:8329 (contains a colon) # - 2001:db8::ff00:42:8329%eth0 (contains a colon) - # - [2001:db8::ff00:42:8329] (contains a colon) + # - [2001:db8::ff00:42:8329] (contains a colon -- brackets should + # have been removed before it gets here) # Rare IP Address formats are not supported per: # https://datatracker.ietf.org/doc/html/rfc3986#section-7.4 # From 1776185b74b1ab39d9b2ca147750b78fdaffd312 Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Thu, 26 Sep 2024 15:31:04 -0500 Subject: [PATCH 02/19] Update yarl/_url.py --- yarl/_url.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yarl/_url.py b/yarl/_url.py index 35a69ba26..4f23cec6c 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -676,6 +676,9 @@ def literal_host(self) -> Union[str, None]: None for relative URLs. https://datatracker.ietf.org/doc/html/rfc2732#section-2 + https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2 + + `IP-literal = "[" ( IPv6address / IPvFuture ) "]"` Examples: 'http://example.com:8080' -> 'example.com' From d4323773b7c550f1705b338e377f2da87616e839 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 26 Sep 2024 20:32:13 +0000 Subject: [PATCH 03/19] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- yarl/_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yarl/_url.py b/yarl/_url.py index 4f23cec6c..cd93a12da 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -677,7 +677,7 @@ def literal_host(self) -> Union[str, None]: https://datatracker.ietf.org/doc/html/rfc2732#section-2 https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2 - + `IP-literal = "[" ( IPv6address / IPvFuture ) "]"` Examples: From 056680279afc6f7999b9801ac86d185875fe05a4 Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Thu, 26 Sep 2024 15:33:26 -0500 Subject: [PATCH 04/19] host_subcomponent --- tests/test_url_parsing.py | 2 +- yarl/_url.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/test_url_parsing.py b/tests/test_url_parsing.py index b873a3140..622e93f2c 100644 --- a/tests/test_url_parsing.py +++ b/tests/test_url_parsing.py @@ -616,6 +616,6 @@ def test_ipv6_url_round_trips(url: str, hostname: str) -> None: hostname_without_brackets = hostname[1:-1] assert parsed._val.hostname == hostname_without_brackets assert parsed.raw_host == hostname_without_brackets - assert parsed.literal_host == hostname + assert parsed.host_subcomponent == hostname assert str(parsed) == url assert str(URL(str(parsed))) == url diff --git a/yarl/_url.py b/yarl/_url.py index 4f23cec6c..609cb9b27 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -395,7 +395,7 @@ def __str__(self) -> str: netloc=self._make_netloc( self.raw_user, self.raw_password, - self.literal_host, + self.host_subcomponent, port, encode_host=False, ) @@ -670,22 +670,21 @@ def host(self) -> Union[str, None]: return _idna_decode(raw) @cached_property - def literal_host(self) -> Union[str, None]: - """Return the literal host part of URL. + def host_subcomponent(self) -> Union[str, None]: + """Return the host subcomponent part of URL. None for relative URLs. 
- https://datatracker.ietf.org/doc/html/rfc2732#section-2 https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2 - + `IP-literal = "[" ( IPv6address / IPvFuture ) "]"` Examples: - 'http://example.com:8080' -> 'example.com' - 'http://example.com:80' -> 'example.com' - 'https://127.0.0.1:8443' -> '127.0.0.1' - 'https://[::1]:8443' -> '[::1]' - 'http://[::1]' -> '[::1]' + - `http://example.com:8080` -> `example.com` + - `http://example.com:80` -> `example.com` + - `https://127.0.0.1:8443` -> `127.0.0.1` + - `https://[::1]:8443` -> `[::1]` + - `http://[::1]` -> `[::1]` """ if (raw := self.raw_host) is None: From 4a85c17bb0dfbfc44486466a30ee57a89d996422 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 15:37:50 -0500 Subject: [PATCH 05/19] tests --- tests/test_url_parsing.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/test_url_parsing.py b/tests/test_url_parsing.py index 622e93f2c..92065b5f0 100644 --- a/tests/test_url_parsing.py +++ b/tests/test_url_parsing.py @@ -607,13 +607,23 @@ def test_schemes_that_require_host(scheme: str) -> None: @pytest.mark.parametrize( - ("url", "hostname"), - [("http://[::1]", "[::1]"), ("http://[::1]:8080", "[::1]")], + ("url", "hostname", "hostname_without_brackets"), + [ + ("http://[::1]", "[::1]", "::1"), + ("http://[::1]:8080", "[::1]", "::1"), + ("http://127.0.0.1:8080", "127.0.0.1", "127.0.0.1"), + ( + "http://xn--jxagkqfkduily1i.eu", + "xn--jxagkqfkduily1i.eu", + "xn--jxagkqfkduily1i.eu", + ), + ], ) -def test_ipv6_url_round_trips(url: str, hostname: str) -> None: +def test_ipv6_url_round_trips( + url: str, hostname: str, hostname_without_brackets: str +) -> None: """Verify that IPv6 URLs round-trip correctly.""" parsed = URL(url) - hostname_without_brackets = hostname[1:-1] assert parsed._val.hostname == hostname_without_brackets assert parsed.raw_host == hostname_without_brackets assert parsed.host_subcomponent == hostname From 
9c27548e541b0ffeaa32151980697f4923c3ac61 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 15:39:56 -0500 Subject: [PATCH 06/19] Update yarl/_url.py --- yarl/_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yarl/_url.py b/yarl/_url.py index 609cb9b27..fdbb5c009 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -647,7 +647,7 @@ def raw_host(self) -> Union[str, None]: None for relative URLs. - For literal IPv6 addresses, use the literal_host property instead + For literal IPv6 addresses, use the host_subcomponent property instead as it will return the host part with brackets. """ # Use host instead of hostname for sake of shortness From 31da379f2da5f77ac5149c263bc91b1ed8073495 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 15:40:50 -0500 Subject: [PATCH 07/19] Apply suggestions from code review --- yarl/_url.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yarl/_url.py b/yarl/_url.py index fdbb5c009..7897b52b1 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -647,8 +647,8 @@ def raw_host(self) -> Union[str, None]: None for relative URLs. - For literal IPv6 addresses, use the host_subcomponent property instead - as it will return the host part with brackets. + When working with IPv6 addresses, use the `host_subcomponent` property instead + as it will return the host subcomponent with brackets. """ # Use host instead of hostname for sake of shortness # May add .hostname prop later From 86a0677de368719d059d4efe70aec7c2bd97aa7b Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Thu, 26 Sep 2024 15:41:37 -0500 Subject: [PATCH 08/19] Update yarl/_url.py --- yarl/_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yarl/_url.py b/yarl/_url.py index 7897b52b1..1fd6c7c6b 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -966,7 +966,7 @@ def _encode_host(cls, host: str, human: bool = False) -> str: raw_ip = host sep = zone = "" - if raw_ip and (":" in raw_ip or raw_ip[-1].isdigit()): + if raw_ip and (raw_ip[-1].isdigit() or ":" in raw_ip): # Might be an IP address, check it # # IP Addresses can look like: From 6401a10abb119285a8e6ab4820cfe0c68bf5523d Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 15:41:58 -0500 Subject: [PATCH 09/19] Update yarl/_url.py --- yarl/_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yarl/_url.py b/yarl/_url.py index 1fd6c7c6b..20a73f891 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -966,7 +966,7 @@ def _encode_host(cls, host: str, human: bool = False) -> str: raw_ip = host sep = zone = "" - if raw_ip and (raw_ip[-1].isdigit() or ":" in raw_ip): + if raw_ip and raw_ip[-1].isdigit() or ":" in raw_ip: # Might be an IP address, check it # # IP Addresses can look like: From 59dbfe4852938198b3e30d302d83861f6cf2847b Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 15:42:14 -0500 Subject: [PATCH 10/19] Update yarl/_url.py --- yarl/_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yarl/_url.py b/yarl/_url.py index 20a73f891..37e4739fa 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -664,7 +664,7 @@ def host(self) -> Union[str, None]: """ if (raw := self.raw_host) is None: return None - if raw and (":" in raw or raw[-1].isdigit()): + if raw and raw[-1].isdigit() or ":" in raw: # IP addresses are never IDNA encoded return raw return _idna_decode(raw) From e55083fd218d899252ed1a6c5263e6a3e264ff5a Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Thu, 26 Sep 2024 15:44:50 -0500 Subject: [PATCH 11/19] Update yarl/_url.py --- yarl/_url.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yarl/_url.py b/yarl/_url.py index 37e4739fa..a61a7f1d2 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -395,7 +395,7 @@ def __str__(self) -> str: netloc=self._make_netloc( self.raw_user, self.raw_password, - self.host_subcomponent, + self.raw_host, port, encode_host=False, ) From f8093ea7f4f7148478939e5353722e3dd34b9653 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 15:48:06 -0500 Subject: [PATCH 12/19] tweaks --- tests/test_url.py | 12 ++++++++++++ tests/test_url_parsing.py | 25 ------------------------- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/tests/test_url.py b/tests/test_url.py index 1dbdd80a0..b9f82b58a 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -176,6 +176,18 @@ def test_raw_host(): assert url.raw_host == url._val.hostname +@pytest.mark.parametrize( + ("host"), + [ + ("example.com"), + ("[::1]"), + ], +) +def test_host_subcomponent(host: str): + url = URL(f"http://{host}") + assert url.host_subcomponent == host + + def test_raw_host_non_ascii(): url = URL("http://оун-упа.укр") assert "xn----8sb1bdhvc.xn--j1amh" == url.raw_host diff --git a/tests/test_url_parsing.py b/tests/test_url_parsing.py index 92065b5f0..4fe95185e 100644 --- a/tests/test_url_parsing.py +++ b/tests/test_url_parsing.py @@ -604,28 +604,3 @@ def test_schemes_that_require_host(scheme: str) -> None: ) with pytest.raises(ValueError, match=expect): URL(f"{scheme}://:1") - - -@pytest.mark.parametrize( - ("url", "hostname", "hostname_without_brackets"), - [ - ("http://[::1]", "[::1]", "::1"), - ("http://[::1]:8080", "[::1]", "::1"), - ("http://127.0.0.1:8080", "127.0.0.1", "127.0.0.1"), - ( - "http://xn--jxagkqfkduily1i.eu", - "xn--jxagkqfkduily1i.eu", - "xn--jxagkqfkduily1i.eu", - ), - ], -) -def test_ipv6_url_round_trips( - url: str, hostname: str, 
hostname_without_brackets: str -) -> None: - """Verify that IPv6 URLs round-trip correctly.""" - parsed = URL(url) - assert parsed._val.hostname == hostname_without_brackets - assert parsed.raw_host == hostname_without_brackets - assert parsed.host_subcomponent == hostname - assert str(parsed) == url - assert str(URL(str(parsed))) == url From cd4505b35e255ec82ab810fbbcce2e680c4550af Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 15:52:57 -0500 Subject: [PATCH 13/19] docs --- CHANGES/1159.feature.rst | 3 +++ docs/api.rst | 15 +++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 CHANGES/1159.feature.rst diff --git a/CHANGES/1159.feature.rst b/CHANGES/1159.feature.rst new file mode 100644 index 000000000..25d06f550 --- /dev/null +++ b/CHANGES/1159.feature.rst @@ -0,0 +1,3 @@ +Added :attr:`~yarl.URL.host_subcomponent` which returns the :rfc:`3986#section-3.2.2` host subcomponent -- by :user:`bdraco`. + +The only current practical difference between :attr:`~yarl.URL.raw_host` and :attr:`~yarl.URL.host_subcomponent` is that IPv6 addresses are returned bracketed. diff --git a/docs/api.rst b/docs/api.rst index b552e4f41..ea8ab3ed4 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -191,7 +191,22 @@ There are two kinds of properties: *decoded* and *encoded* (with >>> URL('http://хост.домен').raw_host 'xn--n1agdj.xn--d1acufc' + >>> URL('http://[::1]').raw_host + '::1' + +.. attribute:: URL.host_subcomponent + + :rfc:`3986#section-3.2.2` host subcomponent part of URL, ``None`` for relative URLs + (:ref:`yarl-api-relative-urls`). + + .. doctest:: + + >>> URL('http://хост.домен').host_subcomponent + 'xn--n1agdj.xn--d1acufc' + >>> URL('http://[::1]').host_subcomponent + '[::1]' + .. versionadded:: 1.13.0 .. attribute:: URL.port From 7d405978be273aa6dacb95b053ff0b0106e453b5 Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Thu, 26 Sep 2024 15:54:07 -0500 Subject: [PATCH 14/19] Update docs/api.rst --- docs/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api.rst b/docs/api.rst index ea8ab3ed4..0e4595b12 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -206,7 +206,7 @@ There are two kinds of properties: *decoded* and *encoded* (with >>> URL('http://[::1]').host_subcomponent '[::1]' - .. versionadded:: 1.13.0 + .. versionadded:: 1.13 .. attribute:: URL.port From 946405ef387bb0342caf7150d92c7ca34f37b2d1 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 16:02:56 -0500 Subject: [PATCH 15/19] lint --- docs/spelling_wordlist.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 4714f0af5..6b90d22e4 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -42,6 +42,7 @@ runtimes sdist subclass subclasses +subcomponent svetlov uncompiled v1 From 918da42060ef99be58320206874ed7dcd0595c38 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 16:11:44 -0500 Subject: [PATCH 16/19] more coverage --- tests/test_url.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/test_url.py b/tests/test_url.py index b9f82b58a..0e13a3cd8 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -181,6 +181,8 @@ def test_raw_host(): [ ("example.com"), ("[::1]"), + ("g%c3%bcnter.com"), + ("xn--gnter-4ya.com"), ], ) def test_host_subcomponent(host: str): @@ -199,6 +201,11 @@ def test_host_non_ascii(): assert "оун-упа.укр" == url.host +def test_host_non_ascii_percent_encoded(): + url = URL("g%c3%bcnter.com") + assert url.host == "günter.com" + + def test_localhost(): url = URL("http://[::1]") assert "::1" == url.host From d5923f03142ba3cc79e3a78fb3effa6ba274435b Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Thu, 26 Sep 2024 16:17:02 -0500 Subject: [PATCH 17/19] cleanup --- tests/test_url.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_url.py b/tests/test_url.py index 0e13a3cd8..886824a3f 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -188,6 +188,7 @@ def test_raw_host(): def test_host_subcomponent(host: str): url = URL(f"http://{host}") assert url.host_subcomponent == host + assert url._val.hostname == url.host_subcomponent def test_raw_host_non_ascii(): From d9332cc5361119a22c7eaade74d9953ae7c5a4e3 Mon Sep 17 00:00:00 2001 From: "J. Nick Koston" Date: Thu, 26 Sep 2024 16:20:27 -0500 Subject: [PATCH 18/19] cleanup --- tests/test_url.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tests/test_url.py b/tests/test_url.py index 886824a3f..7d41007d3 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -181,14 +181,12 @@ def test_raw_host(): [ ("example.com"), ("[::1]"), - ("g%c3%bcnter.com"), ("xn--gnter-4ya.com"), ], ) def test_host_subcomponent(host: str): url = URL(f"http://{host}") assert url.host_subcomponent == host - assert url._val.hostname == url.host_subcomponent def test_raw_host_non_ascii(): @@ -202,11 +200,6 @@ def test_host_non_ascii(): assert "оун-упа.укр" == url.host -def test_host_non_ascii_percent_encoded(): - url = URL("g%c3%bcnter.com") - assert url.host == "günter.com" - - def test_localhost(): url = URL("http://[::1]") assert "::1" == url.host From ca5942bfeb8f7b7dda3d7e45dc8adc8908dd7cc0 Mon Sep 17 00:00:00 2001 From: "J. 
Nick Koston" Date: Thu, 26 Sep 2024 16:23:50 -0500 Subject: [PATCH 19/19] more cover --- tests/test_url.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_url.py b/tests/test_url.py index 7d41007d3..5115c8749 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -189,6 +189,11 @@ def test_host_subcomponent(host: str): assert url.host_subcomponent == host +def test_host_subcomponent_return_idna_encoded_host(): + url = URL("http://оун-упа.укр") + assert url.host_subcomponent == "xn----8sb1bdhvc.xn--j1amh" + + def test_raw_host_non_ascii(): url = URL("http://оун-упа.укр") assert "xn----8sb1bdhvc.xn--j1amh" == url.raw_host