diff --git a/CHANGES/1112.misc.rst b/CHANGES/1112.misc.rst new file mode 100644 index 000000000..f65ee80ae --- /dev/null +++ b/CHANGES/1112.misc.rst @@ -0,0 +1 @@ +Significantly improved performance of parsing the network location -- by :user:`bdraco`. diff --git a/tests/test_url.py b/tests/test_url.py index 831212878..4089b3664 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -114,11 +114,13 @@ def test_scheme(): def test_raw_user(): url = URL("http://user@example.com") assert "user" == url.raw_user + assert url.raw_user == url._val.username def test_raw_user_non_ascii(): url = URL("http://бажан@example.com") assert "%D0%B1%D0%B0%D0%B6%D0%B0%D0%BD" == url.raw_user + assert url.raw_user == url._val.username def test_no_user(): @@ -134,11 +136,13 @@ def test_user_non_ascii(): def test_raw_password(): url = URL("http://user:password@example.com") assert "password" == url.raw_password + assert url.raw_password == url._val.password def test_raw_password_non_ascii(): url = URL("http://user:пароль@example.com") assert "%D0%BF%D0%B0%D1%80%D0%BE%D0%BB%D1%8C" == url.raw_password + assert url.raw_password == url._val.password def test_password_non_ascii(): @@ -152,6 +156,14 @@ def test_password_without_user(): assert "password" == url.password +def test_empty_password_without_user(): + url = URL("http://:@example.com") + assert url.user is None + assert url.password == "" + assert url.raw_password == "" + assert url.raw_password == url._val.password + + def test_user_empty_password(): url = URL("http://user:@example.com") assert "user" == url.user @@ -161,11 +173,13 @@ def test_user_empty_password(): def test_raw_host(): url = URL("http://example.com") assert "example.com" == url.raw_host + assert url.raw_host == url._val.hostname def test_raw_host_non_ascii(): url = URL("http://оун-упа.укр") assert "xn----8sb1bdhvc.xn--j1amh" == url.raw_host + assert url.raw_host == url._val.hostname def test_host_non_ascii(): @@ -186,16 +200,19 @@ def test_host_with_underscore(): def test_raw_host_when_port_is_specified(): url = URL("http://example.com:8888") assert "example.com" == url.raw_host + assert url.raw_host == url._val.hostname def test_raw_host_from_str_with_ipv4(): url = URL("http://127.0.0.1:80") assert url.raw_host == "127.0.0.1" + assert url.raw_host == url._val.hostname def test_raw_host_from_str_with_ipv6(): url = URL("http://[::1]:80") assert url.raw_host == "::1" + assert url.raw_host == url._val.hostname def test_authority_full() -> None: @@ -229,11 +246,13 @@ def test_lowercase(): url = URL("http://gitHUB.com") assert url.raw_host == "github.com" assert url.host == url.raw_host + assert url.raw_host == url._val.hostname def test_lowercase_nonascii(): url = URL("http://Слава.Укр") assert url.raw_host == "xn--80aaf8a3a.xn--j1amh" + assert url.raw_host == url._val.hostname assert url.host == "слава.укр" @@ -241,6 +260,7 @@ def test_compressed_ipv6(): url = URL("http://[1DEC:0:0:0::1]") assert url.raw_host == "1dec::1" assert url.host == url.raw_host + assert url.raw_host == url._val.hostname def test_ipv4_zone(): @@ -248,16 +268,19 @@ def test_ipv4_zone(): url = URL("http://1.2.3.4%тест%42:123") assert url.raw_host == "1.2.3.4%тест%42" assert url.host == url.raw_host + assert url.raw_host == url._val.hostname def test_port_for_explicit_port(): url = URL("http://example.com:8888") assert 8888 == url.port + assert url.explicit_port == url._val.port def test_port_for_implicit_port(): url = URL("http://example.com") assert 80 == url.port + assert url.explicit_port == url._val.port def test_port_for_relative_url(): @@ -273,21 +296,25 @@ def test_port_for_unknown_scheme(): def test_explicit_port_for_explicit_port(): url = URL("http://example.com:8888") assert 8888 == url.explicit_port + assert url.explicit_port == url._val.port def test_explicit_port_for_implicit_port(): url = URL("http://example.com") assert url.explicit_port is None + assert url.explicit_port == url._val.port def test_explicit_port_for_relative_url(): url = URL("/path/to") assert url.explicit_port is None + assert url.explicit_port == url._val.port def test_explicit_port_for_unknown_scheme(): url = URL("unknown://example.com") assert url.explicit_port is None + assert url.explicit_port == url._val.port def test_raw_path_string_empty(): @@ -1903,3 +1930,40 @@ def test_join_encoded_url(): assert path_url.path == "/api/4" new = original.join(path_url) assert new.path == "/api/4" + + +# cache + + +def test_parsing_populates_cache(): + """Test that parsing a URL populates the cache.""" + url = URL("http://user:password@example.com:80/path?a=b#frag") + assert url._cache["raw_user"] == "user" + assert url._cache["raw_password"] == "password" + assert url._cache["raw_host"] == "example.com" + assert url._cache["explicit_port"] == 80 + assert url._cache["raw_query_string"] == "a=b" + assert url._cache["raw_fragment"] == "frag" + assert url._cache["scheme"] == "http" + assert url.raw_user == "user" + assert url.raw_password == "password" + assert url.raw_host == "example.com" + assert url.explicit_port == 80 + assert url.raw_query_string == "a=b" + assert url.raw_fragment == "frag" + assert url.scheme == "http" + url._cache.clear() + assert url.raw_user == "user" + assert url.raw_password == "password" + assert url.raw_host == "example.com" + assert url.explicit_port == 80 + assert url.raw_query_string == "a=b" + assert url.raw_fragment == "frag" + assert url.scheme == "http" + assert url._cache["raw_user"] == "user" + assert url._cache["raw_password"] == "password" + assert url._cache["raw_host"] == "example.com" + assert url._cache["explicit_port"] == 80 + assert url._cache["raw_query_string"] == "a=b" + assert url._cache["raw_fragment"] == "frag" + assert url._cache["scheme"] == "http" diff --git a/yarl/_url.py b/yarl/_url.py index 268ff23a5..ec92edb11 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -9,6 +9,7 @@ TYPE_CHECKING, Any, Callable, + Dict, Iterable, Iterator, List, @@ -209,38 +210,50 @@ def __new__( else: raise TypeError("Constructor parameter should be str") + cache: Dict[str, Union[str, int, None]] = {} if not encoded: host: Union[str, None] - if not val[1]: # netloc - netloc = "" + scheme, netloc, path, query, fragment = val + if not netloc: # netloc host = "" else: - host = val.hostname + username, password, host, port = cls._split_netloc(val[1]) if host is None: raise ValueError("Invalid URL: host is required for absolute urls") - - try: - port = val.port - except ValueError as e: - raise ValueError( - "Invalid URL: port can't be converted to integer" - ) from e - + host = cls._encode_host(host) + raw_user = None if username is None else cls._REQUOTER(username) + raw_password = None if password is None else cls._REQUOTER(password) netloc = cls._make_netloc( - val.username, val.password, host, port, encode=True, requote=True + raw_user, raw_password, host, port, encode_host=False ) - path = cls._PATH_REQUOTER(val[2]) - if netloc: - path = cls._normalize_path(path) + if "[" in host: + # Our host encoder adds back brackets for IPv6 addresses + # so we need to remove them here to get the raw host + _, _, bracketed = host.partition("[") + raw_host, _, _ = bracketed.partition("]") + else: + raw_host = host + cache["raw_host"] = raw_host + cache["raw_user"] = raw_user + cache["raw_password"] = raw_password + cache["explicit_port"] = port + + if path: + path = cls._PATH_REQUOTER(path) + if netloc: + path = cls._normalize_path(path) cls._validate_authority_uri_abs_path(host=host, path=path) - query = cls._QUERY_REQUOTER(val[3]) - fragment = cls._FRAGMENT_REQUOTER(val[4]) - val = SplitResult(val[0], netloc, path, query, fragment) + query = cls._QUERY_REQUOTER(query) if query else query + fragment = cls._FRAGMENT_REQUOTER(fragment) if fragment else fragment + cache["scheme"] = scheme + cache["raw_query_string"] = query + cache["raw_fragment"] = fragment + val = SplitResult(scheme, netloc, path, query, fragment) self = object.__new__(cls) self._val = val - self._cache = {} + self._cache = cache return self @classmethod @@ -412,6 +425,16 @@ def __setstate__(self, state): self._val, *unused = state self._cache = {} + def _cache_netloc(self) -> None: + """Cache the netloc parts of the URL.""" + cache = self._cache + ( + cache["raw_user"], + cache["raw_password"], + cache["raw_host"], + cache["explicit_port"], + ) = self._split_netloc(self._val.netloc) + def is_absolute(self) -> bool: """A check for absolute URLs. @@ -527,7 +550,7 @@ def authority(self) -> str: self.user, self.password, self.host, self.port, encode_host=False ) - @property + @cached_property def raw_user(self) -> Union[str, None]: """Encoded user part of URL. @@ -535,7 +558,8 @@ def raw_user(self) -> Union[str, None]: """ # not .username - return self._val.username or None + self._cache_netloc() + return self._cache["raw_user"] @cached_property def user(self) -> Union[str, None]: @@ -549,14 +573,15 @@ def user(self) -> Union[str, None]: return None return self._UNQUOTER(raw_user) - @property + @cached_property def raw_password(self) -> Union[str, None]: """Encoded password part of URL. None if password is missing. """ - return self._val.password + self._cache_netloc() + return self._cache["raw_password"] @cached_property def password(self) -> Union[str, None]: @@ -579,7 +604,8 @@ def raw_host(self) -> Union[str, None]: """ # Use host instead of hostname for sake of shortness # May add .hostname prop later - return self._val.hostname + self._cache_netloc() + return self._cache["raw_host"] @cached_property def host(self) -> Union[str, None]: @@ -615,7 +641,8 @@ def explicit_port(self) -> Union[int, None]: None for relative URLs or URLs without explicit port. """ - return self._val.port + self._cache_netloc() + return self._cache["explicit_port"] @property def raw_path(self) -> str: @@ -649,7 +676,7 @@ def query(self) -> "MultiDictProxy[str]": ret = MultiDict(parse_qsl(self.raw_query_string, keep_blank_values=True)) return MultiDictProxy(ret) - @property + @cached_property def raw_query_string(self) -> str: """Encoded query part of URL. @@ -681,7 +708,7 @@ def raw_path_qs(self) -> str: return self.raw_path return f"{self.raw_path}?{self.raw_query_string}" - @property + @cached_property def raw_fragment(self) -> str: """Encoded fragment part of URL. @@ -792,7 +819,7 @@ def _validate_authority_uri_abs_path(host: str, path: str) -> None: Raise ValueError if not. """ - if len(host) > 0 and len(path) > 0 and not path.startswith("/"): + if host and path and not path.startswith("/"): raise ValueError( "Path in a URL with authority should start with a slash ('/') if set" ) @@ -924,6 +951,42 @@ def _make_netloc( ret = user + "@" + ret return ret + @classmethod + @lru_cache # match the same size as urlsplit + def _split_netloc( + cls, + netloc: str, + ) -> Tuple[Union[str, None], Union[str, None], Union[str, None], Union[int, None]]: + """Split netloc into username, password, host and port.""" + if "@" not in netloc: + username: Union[str, None] = None + password: Union[str, None] = None + hostinfo = netloc + else: + userinfo, _, hostinfo = netloc.rpartition("@") + username, have_password, password = userinfo.partition(":") + if not have_password: + password = None + + if "[" in hostinfo: + _, _, bracketed = hostinfo.partition("[") + hostname, _, port_str = bracketed.partition("]") + _, _, port_str = port_str.partition(":") + else: + hostname, _, port_str = hostinfo.partition(":") + + if not port_str: + port: Union[int, None] = None + else: + try: + port = int(port_str) + except ValueError: + raise ValueError("Invalid URL: port can't be converted to integer") + if not (0 <= port <= 65535): + raise ValueError("Port out of range 0-65535") + + return username or None, password, hostname or None, port + def with_scheme(self, scheme: str) -> "URL": """Return a new URL with scheme replaced.""" # N.B. doesn't cleanup query/fragment