diff --git a/CHANGES/1348.breaking.rst b/CHANGES/1348.breaking.rst new file mode 100644 index 000000000..845e85800 --- /dev/null +++ b/CHANGES/1348.breaking.rst @@ -0,0 +1,3 @@ +Migrated to using a single cache for encoding hosts -- by :user:`bdraco`. + +Passing ``ip_address_size`` and ``host_validate_size`` to :py:meth:`~yarl.cache_configure` is deprecated in favor of the new ``encode_host_size`` parameter and will be removed in a future release. For backwards compatibility, the old parameters affect the ``encode_host`` cache size. diff --git a/docs/api.rst b/docs/api.rst index b7b80cf61..3c123739a 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1037,22 +1037,20 @@ Default port substitution Cache control ------------- -IDNA conversion, host validation, and IP Address parsing used for host -encoding are quite expensive operations, that's why the ``yarl`` -library caches these calls by storing last ``256`` results in the +IDNA conversion and host encoding are quite expensive operations, +that's why the ``yarl`` library caches these calls by storing results in the global LRU cache. .. function:: cache_clear() - Clear IDNA, host validation, and IP Address caches. + Clear IDNA and host encoding cache. .. function:: cache_info() - Return a dictionary with ``"idna_encode"``, ``"idna_decode"``, ``"ip_address"``, - and ``"host_validate"`` keys, each value - points to corresponding ``CacheInfo`` structure (see :func:`functools.lru_cache` for - details): + Return a dictionary with ``"idna_encode"``, ``"idna_decode"``, and + ``"encode_host"`` keys, each value points to corresponding ``CacheInfo`` + structure (see :func:`functools.lru_cache` for details): .. doctest:: :options: +SKIP @@ -1060,20 +1058,27 @@ global LRU cache. >>> yarl.cache_info() {'idna_encode': CacheInfo(hits=5, misses=5, maxsize=256, currsize=5), 'idna_decode': CacheInfo(hits=24, misses=15, maxsize=256, currsize=15), - 'ip_address': CacheInfo(hits=46933, misses=84, maxsize=256, currsize=101), - 'host_validate': CacheInfo(hits=0, misses=0, maxsize=256, currsize=0)} + 'encode_host': CacheInfo(hits=0, misses=0, maxsize=512, currsize=0)} + .. versionchanged:: 1.16 + ``ip_address``, and ``host_validate`` are deprecated + in favor of a single ``encode_host`` cache. -.. function:: cache_configure(*, idna_encode_size=256, idna_decode_size=256, ip_address_size=256, host_validate_size=256) +.. function:: cache_configure(*, idna_encode_size=256, idna_decode_size=256, encode_host_size=512) - Set the IP Address, host validation, and IDNA encode and - decode cache sizes (``256`` for each by default). + Set the IDNA encode, IDNA decode, and host encode + cache sizes. Pass ``None`` to make the corresponding cache unbounded (may speed up host encoding operation a little but the memory footprint can be very high, please use with caution). + .. versionchanged:: 1.16 + + ``ip_address_size`` and ``host_validate_size`` + are deprecated in favor of a single ``encode_host`` cache. + References ---------- diff --git a/tests/test_cache.py b/tests/test_cache.py index 9b0e75a35..c79f554c8 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,3 +1,5 @@ +import pytest + import yarl # Don't check the actual behavior but make sure that calls are allowed @@ -13,7 +15,13 @@ def test_cache_clear() -> None: def test_cache_info() -> None: info = yarl.cache_info() - assert info.keys() == {"idna_encode", "idna_decode", "ip_address", "host_validate"} + assert info.keys() == { + "idna_encode", + "idna_decode", + "ip_address", + "host_validate", + "encode_host", + } def test_cache_configure_default() -> None: @@ -22,17 +30,36 @@ def test_cache_configure_default() -> None: def test_cache_configure_None() -> None: yarl.cache_configure( - idna_encode_size=None, idna_decode_size=None, - ip_address_size=None, - host_validate_size=None, + idna_encode_size=None, + encode_host_size=None, ) def test_cache_configure_explicit() -> None: yarl.cache_configure( - idna_encode_size=128, idna_decode_size=128, - ip_address_size=128, - host_validate_size=128, + idna_encode_size=128, + encode_host_size=128, ) + + +def test_cache_configure_waring() -> None: + msg = ( + r"cache_configure\(\) no longer accepts the ip_address_size " + r"or host_validate_size arguments, they are used to set the " + r"encode_host_size instead and will be removed in the future" + ) + with pytest.warns(DeprecationWarning, match=msg): + yarl.cache_configure( + idna_encode_size=1024, + idna_decode_size=1024, + ip_address_size=1024, + host_validate_size=1024, + ) + + assert yarl.cache_info()["encode_host"].maxsize == 1024 + with pytest.warns(DeprecationWarning, match=msg): + yarl.cache_configure(host_validate_size=None) + + assert yarl.cache_info()["encode_host"].maxsize is None diff --git a/yarl/__init__.py b/yarl/__init__.py index 7ca6ae2a2..44eeab3f2 100644 --- a/yarl/__init__.py +++ b/yarl/__init__.py @@ -8,7 +8,7 @@ cache_info, ) -__version__ = "1.15.6.dev0" +__version__ = "1.16.0.dev0" __all__ = ( "URL", diff --git a/yarl/_url.py b/yarl/_url.py index 57419043e..35e902673 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -87,6 +87,7 @@ class CacheInfo(TypedDict): idna_decode: _CacheInfo ip_address: _CacheInfo host_validate: _CacheInfo + encode_host: _CacheInfo class _InternalURLCache(TypedDict, total=False): @@ -239,78 +240,6 @@ def _check_netloc(netloc: str) -> None: ) -@lru_cache # match the same size as urlsplit -def _parse_host(host: str) -> tuple[bool, str, Union[bool, None], str, str, str]: - """Parse host into parts - - Returns a tuple of: - - True if the host looks like an IP address, False otherwise. - - Lowercased host - - True if the host is ASCII-only, False otherwise. - - Raw IP address - - Separator between IP address and zone - - Zone part of the IP address - """ - lower_host = host.lower() - is_ascii = host.isascii() - - # If the host ends with a digit or contains a colon, its likely - # an IP address. - if host and (host[-1].isdigit() or ":" in host): - if "%" in host: - return True, lower_host, is_ascii, *host.partition("%") - return True, lower_host, is_ascii, host, "", "" - - return False, lower_host, is_ascii, "", "", "" - - -def _encode_host(host: str, validate_host: bool) -> str: - """Encode host part of URL.""" - looks_like_ip, lower_host, is_ascii, raw_ip, sep, zone = _parse_host(host) - if looks_like_ip: - # If it looks like an IP, we check with _ip_compressed_version - # and fall-through if its not an IP address. This is a performance - # optimization to avoid parsing IP addresses as much as possible - # because it is orders of magnitude slower than almost any other - # operation this library does. - # Might be an IP address, check it - # - # IP Addresses can look like: - # https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2 - # - 127.0.0.1 (last character is a digit) - # - 2001:db8::ff00:42:8329 (contains a colon) - # - 2001:db8::ff00:42:8329%eth0 (contains a colon) - # - [2001:db8::ff00:42:8329] (contains a colon -- brackets should - # have been removed before it gets here) - # Rare IP Address formats are not supported per: - # https://datatracker.ietf.org/doc/html/rfc3986#section-7.4 - # - # IP parsing is slow, so its wrapped in an LRU - try: - host, version = _ip_compressed_version(raw_ip) - except ValueError: - pass - else: - # These checks should not happen in the - # LRU to keep the cache size small - if version == 6: - return f"[{host}%{zone}]" if sep else f"[{host}]" - return f"{host}%{zone}" if sep else host - - # IDNA encoding is slow, - # skip it for ASCII-only strings - # Don't move the check into _idna_encode() helper - # to reduce the cache size - if is_ascii: - # Check for invalid characters explicitly; _idna_encode() does this - # for non-ascii host names. - if validate_host: - _host_validate(lower_host) - return lower_host - - return _idna_encode(lower_host) - - @lru_cache # match the same size as urlsplit def _split_netloc( netloc: str, @@ -1735,10 +1664,11 @@ def _human_quote(s: Union[str, None], unsafe: str) -> Union[str, None]: return "".join(c if c.isprintable() else quote(c) for c in s) -_MAXCACHE = 256 +_DEFAULT_IDNA_SIZE = 256 +_DEFAULT_ENCODE_SIZE = 512 -@lru_cache(_MAXCACHE) +@lru_cache(_DEFAULT_IDNA_SIZE) def _idna_decode(raw: str) -> str: try: return idna.decode(raw.encode("ascii")) @@ -1746,7 +1676,7 @@ def _idna_decode(raw: str) -> str: return raw.encode("ascii").decode("idna") -@lru_cache(_MAXCACHE) +@lru_cache(_DEFAULT_IDNA_SIZE) def _idna_encode(host: str) -> str: try: return idna.encode(host, uts46=True).decode("ascii") @@ -1754,38 +1684,69 @@ def _idna_encode(host: str) -> str: return host.encode("idna").decode("ascii") -@lru_cache(_MAXCACHE) -def _ip_compressed_version(raw_ip: str) -> tuple[str, int]: - """Return compressed version of IP address and its version.""" - ip = ip_address(raw_ip) - return ip.compressed, ip.version +@lru_cache(_DEFAULT_ENCODE_SIZE) +def _encode_host(host: str, validate_host: bool) -> str: + """Encode host part of URL.""" + # If the host ends with a digit or contains a colon, its likely + # an IP address. + if host and (host[-1].isdigit() or ":" in host): + raw_ip, sep, zone = host.partition("%") + # If it looks like an IP, we check with _ip_compressed_version + # and fall-through if its not an IP address. This is a performance + # optimization to avoid parsing IP addresses as much as possible + # because it is orders of magnitude slower than almost any other + # operation this library does. + # Might be an IP address, check it + # + # IP Addresses can look like: + # https://datatracker.ietf.org/doc/html/rfc3986#section-3.2.2 + # - 127.0.0.1 (last character is a digit) + # - 2001:db8::ff00:42:8329 (contains a colon) + # - 2001:db8::ff00:42:8329%eth0 (contains a colon) + # - [2001:db8::ff00:42:8329] (contains a colon -- brackets should + # have been removed before it gets here) + # Rare IP Address formats are not supported per: + # https://datatracker.ietf.org/doc/html/rfc3986#section-7.4 + # + # IP parsing is slow, so its wrapped in an LRU + try: + ip = ip_address(raw_ip) + except ValueError: + pass + else: + # These checks should not happen in the + # LRU to keep the cache size small + host = ip.compressed + if ip.version == 6: + return f"[{host}%{zone}]" if sep else f"[{host}]" + return f"{host}%{zone}" if sep else host + # IDNA encoding is slow, skip it for ASCII-only strings + if host.isascii(): + # Check for invalid characters explicitly; _idna_encode() does this + # for non-ascii host names. + if validate_host and (invalid := NOT_REG_NAME.search(host)): + value, pos, extra = invalid.group(), invalid.start(), "" + if value == "@" or (value == ":" and "@" in host[pos:]): + # this looks like an authority string + extra = ( + ", if the value includes a username or password, " + "use 'authority' instead of 'host'" + ) + raise ValueError( + f"Host {host!r} cannot contain {value!r} (at position {pos}){extra}" + ) from None + return host.lower() -@lru_cache(_MAXCACHE) -def _host_validate(host: str) -> None: - """Validate an ascii host name.""" - invalid = NOT_REG_NAME.search(host) - if invalid is None: - return - value, pos, extra = invalid.group(), invalid.start(), "" - if value == "@" or (value == ":" and "@" in host[pos:]): - # this looks like an authority string - extra = ( - ", if the value includes a username or password, " - "use 'authority' instead of 'host'" - ) - raise ValueError( - f"Host {host!r} cannot contain {value!r} (at position " f"{pos}){extra}" - ) from None + return _idna_encode(host) @rewrite_module def cache_clear() -> None: """Clear all LRU caches.""" - _idna_decode.cache_clear() _idna_encode.cache_clear() - _ip_compressed_version.cache_clear() - _host_validate.cache_clear() + _idna_decode.cache_clear() + _encode_host.cache_clear() @rewrite_module @@ -1794,25 +1755,49 @@ def cache_info() -> CacheInfo: return { "idna_encode": _idna_encode.cache_info(), "idna_decode": _idna_decode.cache_info(), - "ip_address": _ip_compressed_version.cache_info(), - "host_validate": _host_validate.cache_info(), + "ip_address": _encode_host.cache_info(), + "host_validate": _encode_host.cache_info(), + "encode_host": _encode_host.cache_info(), } +_SENTINEL = object() + + @rewrite_module def cache_configure( *, - idna_encode_size: Union[int, None] = _MAXCACHE, - idna_decode_size: Union[int, None] = _MAXCACHE, - ip_address_size: Union[int, None] = _MAXCACHE, - host_validate_size: Union[int, None] = _MAXCACHE, + idna_encode_size: Union[int, None, object] = _DEFAULT_IDNA_SIZE, + idna_decode_size: Union[int, None] = _DEFAULT_IDNA_SIZE, + ip_address_size: Union[int, None, object] = _SENTINEL, + host_validate_size: Union[int, None, object] = _SENTINEL, + encode_host_size: Union[int, None] = _DEFAULT_ENCODE_SIZE, ) -> None: """Configure LRU cache sizes.""" - global _idna_decode, _idna_encode, _ip_compressed_version, _host_validate - - _idna_encode = lru_cache(idna_encode_size)(_idna_encode.__wrapped__) + global _idna_decode, _idna_encode, _encode_host + # ip_address_size, host_validate_size are no longer + # used, but are kept for backwards compatibility. + if encode_host_size is not None: + for size in (ip_address_size, host_validate_size): + if size is not _SENTINEL: + warnings.warn( + "cache_configure() no longer accepts the " + "ip_address_size or host_validate_size arguments, " + "they are used to set the encode_host_size instead " + "and will be removed in the future", + DeprecationWarning, + stacklevel=2, + ) + if size is None: + encode_host_size = None + break + elif size is _SENTINEL: + size = _DEFAULT_ENCODE_SIZE + if TYPE_CHECKING: + assert isinstance(size, int) + if size > encode_host_size: + encode_host_size = size + + _encode_host = lru_cache(encode_host_size)(_encode_host.__wrapped__) _idna_decode = lru_cache(idna_decode_size)(_idna_decode.__wrapped__) - _ip_compressed_version = lru_cache(ip_address_size)( - _ip_compressed_version.__wrapped__ - ) - _host_validate = lru_cache(host_validate_size)(_host_validate.__wrapped__) + _idna_encode = lru_cache(idna_decode_size)(_idna_encode.__wrapped__)