Skip to content

Commit

Permalink
Validate ASCII host values (#954)
Browse files Browse the repository at this point in the history
Co-authored-by: J. Nick Koston <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Sep 26, 2024
1 parent 7128872 commit 3273936
Show file tree
Hide file tree
Showing 9 changed files with 130 additions and 16 deletions.
3 changes: 3 additions & 0 deletions CHANGES/880.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Started rejecting ASCII hostnames with invalid characters. For host strings that
look like authority strings, the exception message includes advice on what to do
instead -- by :user:`mjpieters`.
1 change: 1 addition & 0 deletions CHANGES/954.bugfix.rst
22 changes: 13 additions & 9 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1037,19 +1037,20 @@ Default port substitution
Cache control
-------------

IDNA conversion and IP Address parsing used for host encoding are quite expensive
operations, that's why the ``yarl`` library caches these calls by storing
last ``256`` results in the global LRU cache.
IDNA conversion, host validation, and IP Address parsing used for host
encoding are quite expensive operations, that's why the ``yarl``
library caches these calls by storing last ``256`` results in the
global LRU cache.

.. function:: cache_clear()

Clear IDNA and IP Address caches.
Clear IDNA, host validation, and IP Address caches.


.. function:: cache_info()

Return a dictionary with ``"idna_encode"``, ``"idna_decode"``, and
``"ip_address"`` keys, each value
Return a dictionary with ``"idna_encode"``, ``"idna_decode"``, ``"ip_address"``,
and ``"host_validate"`` keys, each value
points to corresponding ``CacheInfo`` structure (see :func:`functools.lru_cache` for
details):

Expand All @@ -1059,12 +1060,15 @@ last ``256`` results in the global LRU cache.
>>> yarl.cache_info()
{'idna_encode': CacheInfo(hits=5, misses=5, maxsize=256, currsize=5),
'idna_decode': CacheInfo(hits=24, misses=15, maxsize=256, currsize=15),
'ip_address': CacheInfo(hits=46933, misses=84, maxsize=256, currsize=101)}
'ip_address': CacheInfo(hits=46933, misses=84, maxsize=256, currsize=101),
'host_validate': CacheInfo(hits=0, misses=0, maxsize=256, currsize=0)}


.. function:: cache_configure(*, idna_encode_size=256, idna_decode_size=256, ip_address_size=256)

Set the IP Address and IDNA encode and decode cache sizes (``256`` for each by default).
.. function:: cache_configure(*, idna_encode_size=256, idna_decode_size=256, ip_address_size=256, host_validate_size=256)

Set the IP Address, host validation, and IDNA encode and
decode cache sizes (``256`` for each by default).

Pass ``None`` to make the corresponding cache unbounded (may speed up host encoding
operation a little but the memory footprint can be very high,
Expand Down
1 change: 1 addition & 0 deletions docs/spelling_wordlist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ facto
glibc
google
hardcoded
hostnames
macOS
mailto
manylinux
Expand Down
12 changes: 9 additions & 3 deletions tests/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def test_cache_clear() -> None:

def test_cache_info() -> None:
info = yarl.cache_info()
assert info.keys() == {"idna_encode", "idna_decode", "ip_address"}
assert info.keys() == {"idna_encode", "idna_decode", "ip_address", "host_validate"}


def test_cache_configure_default() -> None:
Expand All @@ -22,11 +22,17 @@ def test_cache_configure_default() -> None:

def test_cache_configure_None() -> None:
yarl.cache_configure(
idna_encode_size=None, idna_decode_size=None, ip_address_size=None
idna_encode_size=None,
idna_decode_size=None,
ip_address_size=None,
host_validate_size=None,
)


def test_cache_configure_explicit() -> None:
yarl.cache_configure(
idna_encode_size=128, idna_decode_size=128, ip_address_size=128
idna_encode_size=128,
idna_decode_size=128,
ip_address_size=128,
host_validate_size=128,
)
11 changes: 11 additions & 0 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -2058,3 +2058,14 @@ def test_parsing_populates_cache():
assert url._cache["raw_query_string"] == "a=b"
assert url._cache["raw_fragment"] == "frag"
assert url._cache["scheme"] == "http"


@pytest.mark.parametrize(
("host", "is_authority"),
[
*(("other_gen_delim_" + c, False) for c in "[]"),
],
)
def test_build_with_invalid_ipv6_host(host: str, is_authority: bool):
with pytest.raises(ValueError, match="Invalid IPv6 URL"):
URL(f"http://{host}/")
19 changes: 19 additions & 0 deletions tests/test_url_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,25 @@ def test_build_with_authority_and_host():
URL.build(authority="host.com", host="example.com")


@pytest.mark.parametrize(
("host", "is_authority"),
[
("user:[email protected]", True),
("[email protected]", True),
("host:com", False),
("not_percent_encoded%Zf", False),
("still_not_percent_encoded%fZ", False),
*(("other_gen_delim_" + c, False) for c in "/?#[]"),
],
)
def test_build_with_invalid_host(host: str, is_authority: bool):
match = r"Host '[^']+' cannot contain '[^']+' \(at position \d+\)"
if is_authority:
match += ", if .* use 'authority' instead of 'host'"
with pytest.raises(ValueError, match=f"{match}$"):
URL.build(host=host)


def test_build_with_authority():
url = URL.build(scheme="http", authority="степан:[email protected]:8000", path="path")
assert (
Expand Down
20 changes: 20 additions & 0 deletions tests/test_url_update_netloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,26 @@ def test_with_host_non_ascii():
assert url2.authority == "оун-упа.укр:123"


@pytest.mark.parametrize(
("host", "is_authority"),
[
("user:[email protected]", True),
("[email protected]", True),
("host:com", False),
("not_percent_encoded%Zf", False),
("still_not_percent_encoded%fZ", False),
*(("other_gen_delim_" + c, False) for c in "/?#[]"),
],
)
def test_with_invalid_host(host: str, is_authority: bool):
url = URL("http://example.com:123")
match = r"Host '[^']+' cannot contain '[^']+' \(at position \d+\)"
if is_authority:
match += ", if .* use 'authority' instead of 'host'"
with pytest.raises(ValueError, match=f"{match}$"):
url.with_host(host=host)


def test_with_host_percent_encoded():
url = URL("http://%25cf%2580%cf%80:%25cf%2580%cf%[email protected]:123")
url2 = url.with_host("%cf%80.org")
Expand Down
57 changes: 53 additions & 4 deletions yarl/_url.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import math
import re
import sys
import warnings
from collections.abc import Mapping, Sequence
Expand Down Expand Up @@ -45,6 +46,22 @@

sentinel = object()

# reg-name: unreserved / pct-encoded / sub-delims
# this pattern matches anything that is *not* in those classes. and is only used
# on lower-cased ASCII values.
_not_reg_name = re.compile(
r"""
# any character not in the unreserved or sub-delims sets, plus %
# (validated with the additional check for pct-encoded sequences below)
[^a-z0-9\-._~!$&'()*+,;=%]
|
# % only allowed if it is part of a pct-encoded
# sequence of 2 hex digits.
%(?![0-9a-f]{2})
""",
re.VERBOSE,
)

SimpleQuery = Union[str, int, float]
QueryVariable = Union[SimpleQuery, "Sequence[SimpleQuery]"]
Query = Union[
Expand All @@ -64,6 +81,7 @@ class CacheInfo(TypedDict):
idna_encode: _CacheInfo
idna_decode: _CacheInfo
ip_address: _CacheInfo
host_validate: _CacheInfo


class _SplitResultDict(TypedDict, total=False):
Expand Down Expand Up @@ -269,7 +287,7 @@ def __new__(
raise ValueError(msg)
else:
host = ""
host = cls._encode_host(host)
host = cls._encode_host(host, validate_host=False)
raw_user = None if username is None else cls._REQUOTER(username)
raw_password = None if password is None else cls._REQUOTER(password)
netloc = cls._make_netloc(
Expand Down Expand Up @@ -959,7 +977,9 @@ def _normalize_path(cls, path: str) -> str:
return prefix + "/".join(_normalize_path_segments(segments))

@classmethod
def _encode_host(cls, host: str, human: bool = False) -> str:
def _encode_host(
cls, host: str, human: bool = False, validate_host: bool = True
) -> str:
if "%" in host:
raw_ip, sep, zone = host.partition("%")
else:
Expand Down Expand Up @@ -999,11 +1019,18 @@ def _encode_host(cls, host: str, human: bool = False) -> str:
return host

host = host.lower()
if human:
return host

# IDNA encoding is slow,
# skip it for ASCII-only strings
# Don't move the check into _idna_encode() helper
# to reduce the cache size
if human or host.isascii():
if host.isascii():
# Check for invalid characters explicitly; _idna_encode() does this
# for non-ascii host names.
if validate_host:
_host_validate(host)
return host
return _idna_encode(host)

Expand Down Expand Up @@ -1559,12 +1586,31 @@ def _ip_compressed_version(raw_ip: str) -> Tuple[str, int]:
return ip.compressed, ip.version


@lru_cache(_MAXCACHE)
def _host_validate(host: str) -> None:
"""Validate an ascii host name."""
invalid = _not_reg_name.search(host)
if invalid is None:
return
value, pos, extra = invalid.group(), invalid.start(), ""
if value == "@" or (value == ":" and "@" in host[pos:]):
# this looks like an authority string
extra = (
", if the value includes a username or password, "
"use 'authority' instead of 'host'"
)
raise ValueError(
f"Host {host!r} cannot contain {value!r} (at position " f"{pos}){extra}"
) from None


@rewrite_module
def cache_clear() -> None:
"""Clear all LRU caches."""
_idna_decode.cache_clear()
_idna_encode.cache_clear()
_ip_compressed_version.cache_clear()
_host_validate.cache_clear()


@rewrite_module
Expand All @@ -1574,6 +1620,7 @@ def cache_info() -> CacheInfo:
"idna_encode": _idna_encode.cache_info(),
"idna_decode": _idna_decode.cache_info(),
"ip_address": _ip_compressed_version.cache_info(),
"host_validate": _host_validate.cache_info(),
}


Expand All @@ -1583,12 +1630,14 @@ def cache_configure(
idna_encode_size: Union[int, None] = _MAXCACHE,
idna_decode_size: Union[int, None] = _MAXCACHE,
ip_address_size: Union[int, None] = _MAXCACHE,
host_validate_size: Union[int, None] = _MAXCACHE,
) -> None:
"""Configure LRU cache sizes."""
global _idna_decode, _idna_encode, _ip_compressed_version
global _idna_decode, _idna_encode, _ip_compressed_version, _host_validate

_idna_encode = lru_cache(idna_encode_size)(_idna_encode.__wrapped__)
_idna_decode = lru_cache(idna_decode_size)(_idna_decode.__wrapped__)
_ip_compressed_version = lru_cache(ip_address_size)(
_ip_compressed_version.__wrapped__
)
_host_validate = lru_cache(host_validate_size)(_host_validate.__wrapped__)

0 comments on commit 3273936

Please sign in to comment.