Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Significantly speed up parsing netloc in URL objects #1112

Merged
merged 33 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,13 @@ def test_scheme():
def test_raw_user():
    """raw_user returns the user part and agrees with the split result."""
    # The URL literal must carry real userinfo ("user@"); without it there
    # is no user component for raw_user to return.
    url = URL("http://user@example.com")
    assert "user" == url.raw_user
    assert url.raw_user == url._val.username


def test_raw_user_non_ascii():
    """A non-ASCII user is exposed percent-encoded by raw_user."""
    parsed = URL("http://бажан@example.com")
    encoded_user = "%D0%B1%D0%B0%D0%B6%D0%B0%D0%BD"
    assert parsed.raw_user == encoded_user
    assert parsed._val.username == parsed.raw_user


def test_no_user():
Expand All @@ -134,11 +136,13 @@ def test_user_non_ascii():
def test_raw_password():
    """raw_password returns the password part and agrees with the split result."""
    # The URL literal must carry real userinfo ("user:password@"); without
    # it there is no password component for raw_password to return.
    url = URL("http://user:password@example.com")
    assert "password" == url.raw_password
    assert url.raw_password == url._val.password


def test_raw_password_non_ascii():
    """A non-ASCII password is exposed percent-encoded by raw_password."""
    parsed = URL("http://user:пароль@example.com")
    encoded_password = "%D0%BF%D0%B0%D1%80%D0%BE%D0%BB%D1%8C"
    assert parsed.raw_password == encoded_password
    assert parsed._val.password == parsed.raw_password


def test_password_non_ascii():
Expand All @@ -152,6 +156,14 @@ def test_password_without_user():
assert "password" == url.password


def test_empty_password_without_user():
    """``:@`` userinfo yields no user but an empty (not missing) password."""
    parsed = URL("http://:@example.com")
    assert parsed.user is None
    assert "" == parsed.password
    assert "" == parsed.raw_password
    assert parsed._val.password == parsed.raw_password


def test_user_empty_password():
url = URL("http://user:@example.com")
assert "user" == url.user
Expand All @@ -161,11 +173,13 @@ def test_user_empty_password():
def test_raw_host():
    """raw_host of a plain ASCII host equals the split result's hostname."""
    parsed = URL("http://example.com")
    assert parsed.raw_host == "example.com"
    assert parsed._val.hostname == parsed.raw_host


def test_raw_host_non_ascii():
    """A non-ASCII host is exposed by raw_host in its ``xn--`` encoded form."""
    parsed = URL("http://оун-упа.укр")
    encoded_host = "xn----8sb1bdhvc.xn--j1amh"
    assert parsed.raw_host == encoded_host
    assert parsed._val.hostname == parsed.raw_host


def test_host_non_ascii():
Expand All @@ -186,16 +200,19 @@ def test_host_with_underscore():
def test_raw_host_when_port_is_specified():
    """An explicit port is not part of raw_host."""
    parsed = URL("http://example.com:8888")
    assert parsed.raw_host == "example.com"
    assert parsed._val.hostname == parsed.raw_host


def test_raw_host_from_str_with_ipv4():
    """An IPv4 literal followed by a port is returned unchanged by raw_host."""
    parsed = URL("http://127.0.0.1:80")
    assert "127.0.0.1" == parsed.raw_host
    assert parsed._val.hostname == parsed.raw_host


def test_raw_host_from_str_with_ipv6():
    """raw_host of a bracketed IPv6 literal is returned without the brackets."""
    parsed = URL("http://[::1]:80")
    assert "::1" == parsed.raw_host
    assert parsed._val.hostname == parsed.raw_host


def test_authority_full() -> None:
Expand Down Expand Up @@ -229,35 +246,41 @@ def test_lowercase():
url = URL("http://gitHUB.com")
assert url.raw_host == "github.com"
assert url.host == url.raw_host
assert url.raw_host == url._val.hostname


def test_lowercase_nonascii():
    """A mixed-case non-ASCII host is lower-cased in both raw_host and host."""
    parsed = URL("http://Слава.Укр")
    encoded_host = "xn--80aaf8a3a.xn--j1amh"
    assert encoded_host == parsed.raw_host
    assert parsed._val.hostname == parsed.raw_host
    assert "слава.укр" == parsed.host


def test_compressed_ipv6():
    """An IPv6 literal is exposed lower-cased and in compressed form."""
    parsed = URL("http://[1DEC:0:0:0::1]")
    assert "1dec::1" == parsed.raw_host
    assert parsed.raw_host == parsed.host
    assert parsed._val.hostname == parsed.raw_host


def test_ipv4_zone():
    """Pin the current handling of a ``%``-suffixed IPv4 host with a port."""
    # NOTE: the original author was unsure whether this behaviour is correct.
    parsed = URL("http://1.2.3.4%тест%42:123")
    expected_host = "1.2.3.4%тест%42"
    assert expected_host == parsed.raw_host
    assert parsed.raw_host == parsed.host
    assert parsed._val.hostname == parsed.raw_host


def test_port_for_explicit_port():
    """port returns the port written in the URL."""
    parsed = URL("http://example.com:8888")
    assert parsed.port == 8888
    assert parsed._val.port == parsed.explicit_port


def test_port_for_implicit_port():
    """port is 80 for an http URL that does not spell out a port."""
    parsed = URL("http://example.com")
    assert parsed.port == 80
    assert parsed._val.port == parsed.explicit_port


def test_port_for_relative_url():
Expand All @@ -273,21 +296,25 @@ def test_port_for_unknown_scheme():
def test_explicit_port_for_explicit_port():
    """explicit_port returns the port written in the URL."""
    parsed = URL("http://example.com:8888")
    assert parsed.explicit_port == 8888
    assert parsed._val.port == parsed.explicit_port


def test_explicit_port_for_implicit_port():
    """explicit_port is None when the URL does not spell out a port."""
    parsed = URL("http://example.com")
    assert parsed.explicit_port is None
    assert parsed._val.port == parsed.explicit_port


def test_explicit_port_for_relative_url():
    """explicit_port is None for a relative URL."""
    parsed = URL("/path/to")
    assert parsed.explicit_port is None
    assert parsed._val.port == parsed.explicit_port


def test_explicit_port_for_unknown_scheme():
    """explicit_port is None for an unknown scheme with no port in the URL."""
    parsed = URL("unknown://example.com")
    assert parsed.explicit_port is None
    assert parsed._val.port == parsed.explicit_port


def test_raw_path_string_empty():
Expand Down Expand Up @@ -1903,3 +1930,40 @@ def test_join_encoded_url():
assert path_url.path == "/api/4"
new = original.join(path_url)
assert new.path == "/api/4"


# cache


def test_parsing_populates_cache():
    """Test that parsing a URL populates the cache.

    The same expectations are checked in four phases: the cache straight
    after construction, the public properties, the properties again after
    the cache has been cleared, and finally the cache once those property
    reads have repopulated it.
    """
    # The URL must contain real userinfo, host, port, query and fragment so
    # that every cached component has a value to compare against.
    url = URL("http://user:password@example.com:80/path?a=b#frag")
    expected = {
        "raw_user": "user",
        "raw_password": "password",
        "raw_host": "example.com",
        "explicit_port": 80,
        "raw_query_string": "a=b",
        "raw_fragment": "frag",
        "scheme": "http",
    }

    # Phase 1: the cache is inspected before any property has been read.
    for key, value in expected.items():
        assert url._cache[key] == value
    # Phase 2: properties return the same values.
    for key, value in expected.items():
        assert getattr(url, key) == value

    url._cache.clear()

    # Phase 3: properties still work with an empty cache.
    for key, value in expected.items():
        assert getattr(url, key) == value
    # Phase 4: reading the properties has refilled the cache.
    for key, value in expected.items():
        assert url._cache[key] == value
102 changes: 81 additions & 21 deletions yarl/_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
Iterator,
List,
Expand Down Expand Up @@ -210,38 +211,47 @@ def __new__(
else:
raise TypeError("Constructor parameter should be str")

cache: Dict[str, Union[str, int, None]] = {}
bdraco marked this conversation as resolved.
Show resolved Hide resolved
if not encoded:
host: Optional[str]
if not val[1]: # netloc
netloc = ""
host = ""
host = netloc = ""
else:
host = val.hostname
username, password, host, port = cls._split_netloc(val[1])
if host is None:
raise ValueError("Invalid URL: host is required for absolute urls")

try:
port = val.port
except ValueError as e:
raise ValueError(
"Invalid URL: port can't be converted to integer"
) from e

host = cls._encode_host(host)
raw_user = None if username is None else cls._REQUOTER(username)
raw_password = None if password is None else cls._REQUOTER(password)
bdraco marked this conversation as resolved.
Show resolved Hide resolved
netloc = cls._make_netloc(
val.username, val.password, host, port, encode=True, requote=True
raw_user, raw_password, host, port, encode_host=False
)
if "[" in host:
# Our host encoder adds back brackets for IPv6 addresses
# so we need to remove them here to get the raw host
_, _, bracketed = host.partition("[")
raw_host, _, _ = bracketed.partition("]")
else:
raw_host = host
cache["raw_host"] = raw_host
cache["raw_user"] = raw_user
cache["raw_password"] = raw_password
cache["explicit_port"] = port
path = cls._PATH_REQUOTER(val[2])
if netloc:
path = cls._normalize_path(path)

cls._validate_authority_uri_abs_path(host=host, path=path)
query = cls._QUERY_REQUOTER(val[3])
fragment = cls._FRAGMENT_REQUOTER(val[4])
cache["scheme"] = val[0]
cache["raw_query_string"] = query
cache["raw_fragment"] = fragment
val = SplitResult(val[0], netloc, path, query, fragment)

self = object.__new__(cls)
self._val = val
self._cache = {}
self._cache = cache
return self

@classmethod
Expand Down Expand Up @@ -413,6 +423,16 @@ def __setstate__(self, state):
self._val, *unused = state
self._cache = {}

def _cache_netloc(self) -> None:
    """Cache the netloc parts of the URL.

    Splits ``self._val.netloc`` with ``self._split_netloc`` and stores the
    four resulting values in ``self._cache`` under the keys ``raw_user``,
    ``raw_password``, ``raw_host`` and ``explicit_port``.
    """
    cache = self._cache
    # _split_netloc returns (username, password, hostname, port) in exactly
    # this order, so the tuple is unpacked straight into the cache slots.
    (
        cache["raw_user"],
        cache["raw_password"],
        cache["raw_host"],
        cache["explicit_port"],
    ) = self._split_netloc(self._val.netloc)

def is_absolute(self) -> bool:
"""A check for absolute URLs.

Expand Down Expand Up @@ -528,15 +548,16 @@ def authority(self) -> str:
self.user, self.password, self.host, self.port, encode_host=False
)

@cached_property
def raw_user(self) -> Optional[str]:
    """Encoded user part of URL.

    None if user is missing.

    """
    # Populate the netloc entries of the cache, then read the user back
    # from it instead of going through the split result's username.
    self._cache_netloc()
    return self._cache["raw_user"]

@cached_property
def user(self) -> Optional[str]:
Expand All @@ -550,14 +571,15 @@ def user(self) -> Optional[str]:
return None
return self._UNQUOTER(raw_user)

@cached_property
def raw_password(self) -> Optional[str]:
    """Encoded password part of URL.

    None if password is missing.

    """
    # Populate the netloc entries of the cache, then read the password
    # back from it instead of going through the split result's password.
    self._cache_netloc()
    return self._cache["raw_password"]

@cached_property
def password(self) -> Optional[str]:
Expand All @@ -580,7 +602,8 @@ def raw_host(self) -> Optional[str]:
"""
# Use host instead of hostname for sake of shortness
# May add .hostname prop later
return self._val.hostname
self._cache_netloc()
return self._cache["raw_host"]

@cached_property
def host(self) -> Optional[str]:
Expand Down Expand Up @@ -616,7 +639,8 @@ def explicit_port(self) -> Optional[int]:
None for relative URLs or URLs without explicit port.

"""
return self._val.port
self._cache_netloc()
return self._cache["explicit_port"]

@property
def raw_path(self) -> str:
Expand Down Expand Up @@ -650,7 +674,7 @@ def query(self) -> "MultiDictProxy[str]":
ret = MultiDict(parse_qsl(self.raw_query_string, keep_blank_values=True))
return MultiDictProxy(ret)

@property
@cached_property
def raw_query_string(self) -> str:
"""Encoded query part of URL.

Expand Down Expand Up @@ -682,7 +706,7 @@ def raw_path_qs(self) -> str:
return self.raw_path
return f"{self.raw_path}?{self.raw_query_string}"

@property
@cached_property
def raw_fragment(self) -> str:
"""Encoded fragment part of URL.

Expand Down Expand Up @@ -925,6 +949,42 @@ def _make_netloc(
ret = user + "@" + ret
return ret

@classmethod
@lru_cache  # match the same size as urlsplit
def _split_netloc(
    cls,
    netloc: str,
) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[int]]:
    """Split netloc into username, password, host and port.

    Returns a ``(username, password, hostname, port)`` tuple:

    * ``username`` is ``None`` when there is no userinfo or the user is empty.
    * ``password`` is ``None`` when the userinfo has no ``:``; an empty
      password after the ``:`` is returned as ``""``.
    * ``hostname`` has IPv6 brackets removed and is ``None`` when empty.
    * ``port`` is ``None`` when no port is given.

    Raises ``ValueError`` if the port is not made of ASCII digits only or
    is outside the range 0-65535.
    """
    if "@" not in netloc:
        username: Optional[str] = None
        password: Optional[str] = None
        hostinfo = netloc
    else:
        # rpartition: everything before the *last* "@" is userinfo.
        userinfo, _, hostinfo = netloc.rpartition("@")
        username, have_password, password = userinfo.partition(":")
        if not have_password:
            password = None

    if "[" in hostinfo:
        # Bracketed (IPv6) host: the hostname is the text between the
        # brackets and the port follows the closing bracket.
        _, _, bracketed = hostinfo.partition("[")
        hostname, _, port_str = bracketed.partition("]")
        _, _, port_str = port_str.partition(":")
    else:
        hostname, _, port_str = hostinfo.partition(":")

    port: Optional[int] = None
    if port_str:
        # int() on its own is too lenient for a URL port: it accepts a
        # sign, surrounding whitespace, underscores and non-ASCII digits
        # ("+80", " 80", "8_0"), so require plain ASCII digits first.
        if not (port_str.isascii() and port_str.isdigit()):
            raise ValueError("Invalid URL: port can't be converted to integer")
        try:
            port = int(port_str)
        except ValueError as exc:
            # Still possible for digit-only input, e.g. when the
            # interpreter limits the length of integer strings.
            raise ValueError(
                "Invalid URL: port can't be converted to integer"
            ) from exc
        if not (0 <= port <= 65535):
            raise ValueError("Port out of range 0-65535")

    return username or None, password, hostname or None, port

def with_scheme(self, scheme: str) -> "URL":
"""Return a new URL with scheme replaced."""
# N.B. doesn't cleanup query/fragment
Expand Down
Loading