Skip to content

Commit

Permalink
Normalize URL string ports per RFC 3986 section 3.2.3
Browse files Browse the repository at this point in the history
Previously, `yarl.URL` string representation would sometimes render
ports for well-known URI schemes. However, section 3.2.3 of RFC 3986
explicitly instructs us to omit it [[1]]. This patch does exactly that.

PR #1033

[1]: https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.2.3

Co-authored-by: Sam Bull <[email protected]>
Co-authored-by: Sviatoslav Sydorenko <[email protected]>
  • Loading branch information
3 people authored Jul 17, 2024
1 parent 56a1d6b commit d5c5ab3
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 5 deletions.
6 changes: 6 additions & 0 deletions CHANGES/1033.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
The default protocol ports of well-known URI schemes are now taken into account
during the normalization of the URL string representation in accordance with
:rfc:`3986#section-3.2.3`.

Specified ports are removed from the :class:`str` representation of a :class:`~yarl.URL`
if the port matches the scheme's default port -- by :user:`commonism`.
7 changes: 7 additions & 0 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,12 @@ def test_authority_full_nonasci() -> None:
assert url.authority == "степан:пароль@слава.укр:8080"


def test_authority_unknown_scheme() -> None:
v = "scheme://user:[email protected]:43/path/to?a=1&b=2"
url = URL(v)
assert str(url) == v


def test_lowercase():
url = URL("http://gitHUB.com")
assert url.raw_host == "github.com"
Expand Down Expand Up @@ -1332,6 +1338,7 @@ def test_is_default_port_for_absolute_url_without_port():
def test_is_default_port_for_absolute_url_with_default_port():
url = URL("http://example.com:80")
assert url.is_default_port()
assert str(url) == "http://example.com"


def test_is_default_port_for_absolute_url_with_nondefault_port():
Expand Down
25 changes: 24 additions & 1 deletion tests/test_url_update_netloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,37 @@ def test_with_port():
assert str(url.with_port(8888)) == "http://example.com:8888"


def test_with_default_port_normalization() -> None:
url = URL("http://example.com")
assert str(url.with_scheme("https")) == "https://example.com"
assert str(url.with_scheme("https").with_port(443)) == "https://example.com"
assert str(url.with_port(443).with_scheme("https")) == "https://example.com"


def test_with_custom_port_normalization() -> None:
url = URL("http://example.com")
u88 = url.with_port(88)
assert str(u88) == "http://example.com:88"
assert str(u88.with_port(80)) == "http://example.com"
assert str(u88.with_scheme("https")) == "https://example.com:88"


def test_with_explicit_port_normalization() -> None:
url = URL("http://example.com")
u80 = url.with_port(80)
assert str(u80) == "http://example.com"
assert str(u80.with_port(81)) == "http://example.com:81"
assert str(u80.with_scheme("https")) == "https://example.com:80"


def test_with_port_with_no_port():
url = URL("http://example.com")
assert str(url.with_port(None)) == "http://example.com"


def test_with_port_ipv6():
url = URL("http://[::1]:8080/")
assert str(url.with_port(80)) == "http://[::1]:80/"
assert str(url.with_port(81)) == "http://[::1]:81/"


def test_with_port_keeps_query_and_fragment():
Expand Down
43 changes: 39 additions & 4 deletions yarl/_url.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import functools
import math
import socket
import warnings
from collections.abc import Mapping, Sequence
from contextlib import suppress
from ipaddress import ip_address
from typing import Union
from urllib.parse import SplitResult, parse_qsl, quote, urljoin, urlsplit, urlunsplit

import idna
Expand Down Expand Up @@ -291,6 +293,18 @@ def __str__(self):
val = self._val
if not val.path and self.is_absolute() and (val.query or val.fragment):
val = val._replace(path="/")
if (port := self._get_port()) is None:
# port normalization - using None for default ports to remove from rendering
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-6.2.3
val = val._replace(
netloc=self._make_netloc(
self.raw_user,
self.raw_password,
self.raw_host,
port,
encode_host=False,
)
)
return urlunsplit(val)

def __repr__(self):
Expand Down Expand Up @@ -382,10 +396,13 @@ def is_default_port(self):
e.g. 'http://python.org' or 'http://python.org:80', False
otherwise.
Return False for relative URLs.
"""
if self.port is None:
return False
default = DEFAULT_PORTS.get(self.scheme)
if self.explicit_port is None:
# A relative URL does not have an implicit port / default port
return self.port is not None
default = self._get_default_port()
if default is None:
return False
return self.port == default
Expand Down Expand Up @@ -435,6 +452,24 @@ def raw_authority(self):
"""
return self._val.netloc

def _get_default_port(self) -> Union[int, None]:
if not self.scheme:
return None

with suppress(KeyError):
return DEFAULT_PORTS[self.scheme]

with suppress(OSError):
return socket.getservbyname(self.scheme)

This comment has been minimized.

Copy link
@bdraco

bdraco Sep 1, 2024

Member

This does blocking I/O an is not async safe

open("/etc/services", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = 3
fcntl(3, F_SETFD, FD_CLOEXEC)           = 0
read(3, "# Network services, Internet sty"..., 1024) = 1024
read(3, "nternet Gopher\nfinger\t\t79/tcp\nht"..., 1024) = 1024
read(3, "174/tcp\t\t\t# Mailer transport que"..., 1024) = 1024
read(3, "Transfer\nisakmp\t\t500/udp\t\t\t\t# IP"..., 1024) = 1024
read(3, "\ntinc\t\t655/tcp\t\t\t\t# tinc control"..., 1024) = 1024
read(3, "ks\t\t1080/tcp\t\t\t# socks proxy ser"..., 1024) = 1024
read(3, "-se\t2431/tcp\t\t\t# tcp side effect"..., 1024) = 1024
read(3, "\t4460/tcp\t# Network Time Securit"..., 1024) = 1024
read(3, "cd\t6445/tcp\tsge_execd\t# Grid Eng"..., 1024) = 1024
read(3, "50/tcp\t\t\t# Zabbix Agent\nzabbix-t"..., 1024) = 1024
read(3, "beros passwd server\nkrb-prop\t754"..., 1024) = 1024
read(3, " vty (zebra)\nospfd\t\t2604/tcp\t\t\t#"..., 1024) = 1024
read(3, "es\namidxtape\t10083/tcp\t\t\t# amand"..., 1024) = 525
read(3, "", 1024)                       = 0
close(3)                                = 0
write(1, "80\n", 380

return None

def _get_port(self) -> Union[int, None]:
"""Port or None if default port"""
if self._get_default_port() == self.port:
return None
return self.port

@cached_property
def authority(self):
"""Decoded authority part of URL.
Expand Down Expand Up @@ -522,7 +557,7 @@ def port(self):
scheme without default port substitution.
"""
return self._val.port or DEFAULT_PORTS.get(self._val.scheme)
return self._val.port or self._get_default_port()

@property
def explicit_port(self):
Expand Down

0 comments on commit d5c5ab3

Please sign in to comment.