-
-
Notifications
You must be signed in to change notification settings - Fork 170
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Move URL parsing functions to their own module (#1360)
- Loading branch information
Showing
2 changed files
with
208 additions
and
205 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
"""URL parsing utilities.""" | ||
|
||
import re | ||
import unicodedata | ||
from functools import lru_cache | ||
from typing import Union | ||
from urllib.parse import SplitResult, scheme_chars, uses_netloc | ||
|
||
from ._quoters import QUOTER | ||
|
||
# Leading and trailing C0 control and space to be stripped per WHATWG spec.
# Covers every code point from U+0000 through U+0020 (space) inclusive.
WHATWG_C0_CONTROL_OR_SPACE = "".join(map(chr, range(0x20 + 1)))

# Unsafe bytes to be removed per WHATWG spec
UNSAFE_URL_BYTES_TO_REMOVE = list("\t\r\n")

# Schemes for which urllib.parse considers a netloc (authority) part applicable,
# frozen for fast membership tests.
USES_AUTHORITY = frozenset(uses_netloc)
|
||
|
||
@lru_cache
def split_url(url: str) -> SplitResult:
    """Split URL into parts.

    Adapted from ``urllib.parse.urlsplit``.

    Leading C0 control characters and spaces are stripped, and tab, carriage
    return and line feed characters are removed from anywhere in the string
    before parsing. Trailing whitespace is deliberately preserved.

    Returns a ``SplitResult`` of ``(scheme, netloc, path, query, fragment)``;
    components that are absent are empty strings. The scheme is lowercased.

    Raises ``ValueError`` if the netloc has unbalanced square brackets, if a
    bracketed host is empty, is a malformed IPvFuture literal or contains no
    colon, or if a non-ASCII netloc produces URL delimiters under NFKC
    normalization.
    """
    # Only lstrip url as some applications rely on preserving trailing space.
    # (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
    url = url.lstrip(WHATWG_C0_CONTROL_OR_SPACE)
    for b in UNSAFE_URL_BYTES_TO_REMOVE:
        if b in url:
            url = url.replace(b, "")

    scheme = netloc = query = fragment = ""
    i = url.find(":")
    # RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ).
    # The first character must be an ASCII letter, otherwise inputs such as
    # "127.0.0.1:8080" would be misread as having the scheme "127.0.0.1".
    if i > 0 and url[0].isascii() and url[0].isalpha():
        for c in url[1:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i + 1 :]
    has_hash = "#" in url
    has_question_mark = "?" in url
    if url[:2] == "//":
        delim = len(url)  # position of end of domain part of url, default is end
        # Only search for delimiters that are actually present in the URL.
        if has_hash and has_question_mark:
            delim_chars = "/?#"
        elif has_question_mark:
            delim_chars = "/?"
        elif has_hash:
            delim_chars = "/#"
        else:
            delim_chars = "/"
        for c in delim_chars:  # look for delimiters; the order is NOT important
            wdelim = url.find(c, 2)  # find first of this delim
            if wdelim >= 0 and wdelim < delim:  # if found
                delim = wdelim  # use earliest delim position
        netloc = url[2:delim]
        url = url[delim:]
        has_left_bracket = "[" in netloc
        has_right_bracket = "]" in netloc
        if has_left_bracket != has_right_bracket:
            raise ValueError("Invalid IPv6 URL")
        if has_left_bracket:
            bracketed_host = netloc.partition("[")[2].partition("]")[0]
            # Valid bracketed hosts are defined in
            # https://www.rfc-editor.org/rfc/rfc3986#page-49
            # https://url.spec.whatwg.org/
            # Slice instead of indexing so that an empty host ("[]") falls
            # through to the ValueError below rather than raising IndexError.
            if bracketed_host[:1] == "v":
                if not re.match(r"\Av[a-fA-F0-9]+\..+\Z", bracketed_host):
                    raise ValueError("IPvFuture address is invalid")
            elif ":" not in bracketed_host:
                raise ValueError("An IPv4 address cannot be in brackets")
    if has_hash:
        url, _, fragment = url.partition("#")
    if has_question_mark:
        url, _, query = url.partition("?")
    if netloc and not netloc.isascii():
        _check_netloc(netloc)
    # Build the named tuple via tuple.__new__, skipping SplitResult's own
    # constructor.
    return tuple.__new__(SplitResult, (scheme, netloc, url, query, fragment))
|
||
|
||
def _check_netloc(netloc: str) -> None: | ||
# Adapted from urllib.parse._checknetloc | ||
# looking for characters like \u2100 that expand to 'a/c' | ||
# IDNA uses NFKC equivalence, so normalize for this check | ||
|
||
# ignore characters already included | ||
# but not the surrounding text | ||
n = netloc.replace("@", "").replace(":", "").replace("#", "").replace("?", "") | ||
normalized_netloc = unicodedata.normalize("NFKC", n) | ||
if n == normalized_netloc: | ||
return | ||
# Note that there are no unicode decompositions for the character '@' so | ||
# its currently impossible to have test coverage for this branch, however if the | ||
# one should be added in the future we want to make sure its still checked. | ||
for c in "/?#@:": # pragma: no branch | ||
if c in normalized_netloc: | ||
raise ValueError( | ||
f"netloc '{netloc}' contains invalid " | ||
"characters under NFKC normalization" | ||
) | ||
|
||
|
||
@lru_cache  # match the same size as urlsplit
def split_netloc(
    netloc: str,
) -> tuple[Union[str, None], Union[str, None], Union[str, None], Union[int, None]]:
    """Split netloc into username, password, host and port.

    The userinfo is everything before the last ``@``; it is split on the
    first ``:`` into username and password. A bracketed host is returned
    without its brackets.

    Returns a ``(username, password, hostname, port)`` tuple. An empty
    username or hostname is returned as ``None``; the password is ``None``
    only when no ``:`` is present in the userinfo; the port is ``None`` when
    no port text is present.

    Raises ``ValueError`` if the port is not made up solely of ASCII digits
    or is outside the range 0-65535.
    """
    if "@" not in netloc:
        username: Union[str, None] = None
        password: Union[str, None] = None
        hostinfo = netloc
    else:
        # rpartition: the password itself may contain "@".
        userinfo, _, hostinfo = netloc.rpartition("@")
        username, have_password, password = userinfo.partition(":")
        if not have_password:
            password = None

    if "[" in hostinfo:
        # Bracketed literal: the host is the text between "[" and "]", and
        # the port (if any) follows the ":" after the closing bracket.
        _, _, bracketed = hostinfo.partition("[")
        hostname, _, port_str = bracketed.partition("]")
        _, _, port_str = port_str.partition(":")
    else:
        hostname, _, port_str = hostinfo.partition(":")

    if not port_str:
        return username or None, password, hostname or None, None

    # RFC 3986 defines port as *DIGIT. int() on its own would also accept
    # forms such as "+80", " 80", "8_0" or non-ASCII digits, so validate the
    # text before converting.
    port: Union[int, None] = None
    if port_str.isascii() and port_str.isdigit():
        try:
            port = int(port_str)
        except ValueError:
            # int() can still refuse extremely long digit strings.
            port = None
    if port is None:
        raise ValueError("Invalid URL: port can't be converted to integer")
    if port > 65535:
        raise ValueError("Port out of range 0-65535")
    return username or None, password, hostname or None, port
|
||
|
||
def unsplit_result(
    scheme: str, netloc: str, url: str, query: str, fragment: str
) -> str:
    """Unsplit a URL without any normalization.

    Joins the five components back into one string. An authority part
    (``//netloc``) is emitted when a netloc is given, when the scheme is one
    of ``USES_AUTHORITY``, or when the path already starts with ``//``. A
    non-empty path that follows an authority is given a leading ``/`` if it
    lacks one. Empty scheme, query and fragment are omitted along with
    their delimiters.
    """
    pieces = []
    if scheme:
        pieces.append(f"{scheme}:")
    if netloc or (scheme and scheme in USES_AUTHORITY) or url.startswith("//"):
        pieces.append(f"//{netloc or ''}")
        # Separate the authority from a relative-looking path.
        if url and not url.startswith("/"):
            pieces.append("/")
    pieces.append(url)
    if query:
        pieces.append(f"?{query}")
    if fragment:
        pieces.append(f"#{fragment}")
    return "".join(pieces)
|
||
|
||
@lru_cache  # match the same size as urlsplit
def make_netloc(
    user: Union[str, None],
    password: Union[str, None],
    host: Union[str, None],
    port: Union[int, None],
    encode: bool = False,
) -> str:
    """Make netloc from parts.

    The user and password are passed through ``QUOTER`` if encode is True.
    The host must already be encoded with _encode_host.

    Returns an empty string when host is None. Otherwise returns
    ``host[:port]``, prefixed with ``user[:password]@`` when the resulting
    userinfo is non-empty.
    """
    if host is None:
        return ""

    hostport = host if port is None else f"{host}:{port}"
    if user is None and password is None:
        return hostport

    if password is None:
        # User only; an empty user yields no userinfo at all.
        userinfo = QUOTER(user) if (user and encode) else user
    else:
        # A password without a user still produces ":password@".
        name = user or ""
        if name and encode:
            name = QUOTER(name)
        secret = QUOTER(password) if encode else password
        userinfo = f"{name}:{secret}"

    return f"{userinfo}@{hostport}" if userinfo else hostport
Oops, something went wrong.