aio-libs · bdraco · Aug 31, 2024 · Aug 31, 2024 · Aug 31, 2024 · Aug 31, 2024
diff --git a/CHANGES/1067.bugfix.rst b/CHANGES/1067.bugfix.rst
@@ -0,0 +1,3 @@
+Reverted RFC3986 compatible URL.join honoring empty segments which was introduced in :issue:`1039`.
+
+This change introduced a regression handling query string parameters with joined URLs. The change was reverted to maintain compatibility with the previous behavior.
diff --git a/tests/test_url.py b/tests/test_url.py
@@ -1664,69 +1664,6 @@ def test_join_from_rfc_3986_abnormal(url, expected):
     assert base.join(url) == expected
 
 
-EMPTY_SEGMENTS = [
-    (
-        "https://web.archive.org/web/",
-        "./https://github.com/aio-libs/yarl",
-        "https://web.archive.org/web/https://github.com/aio-libs/yarl",
-    ),
-    (
-        "https://web.archive.org/web/https://github.com/",
-        "aio-libs/yarl",
-        "https://web.archive.org/web/https://github.com/aio-libs/yarl",
-    ),
-]
-
-
-@pytest.mark.parametrize("base,url,expected", EMPTY_SEGMENTS)
-def test_join_empty_segments(base, url, expected):
-    base = URL(base)
-    url = URL(url)
-    expected = URL(expected)
-    joined = base.join(url)
-    assert joined == expected
-
-
-SIMPLE_BASE = "http://a/b/c/d"
-URLLIB_URLJOIN = [
-    ("", "http://a/b/c/g?y/./x", "http://a/b/c/g?y/./x"),
-    ("", "http://a/./g", "http://a/./g"),
-    ("svn://pathtorepo/dir1", "dir2", "svn://pathtorepo/dir2"),
-    ("svn+ssh://pathtorepo/dir1", "dir2", "svn+ssh://pathtorepo/dir2"),
-    ("ws://a/b", "g", "ws://a/g"),
-    ("wss://a/b", "g", "wss://a/g"),
-    # test for issue22118 duplicate slashes
-    (SIMPLE_BASE + "/", "foo", SIMPLE_BASE + "/foo"),
-    # Non-RFC-defined tests, covering variations of base and trailing
-    # slashes
-    ("http://a/b/c/d/e/", "../../f/g/", "http://a/b/c/f/g/"),
-    ("http://a/b/c/d/e", "../../f/g/", "http://a/b/f/g/"),
-    ("http://a/b/c/d/e/", "/../../f/g/", "http://a/f/g/"),
-    ("http://a/b/c/d/e", "/../../f/g/", "http://a/f/g/"),
-    ("http://a/b/c/d/e/", "../../f/g", "http://a/b/c/f/g"),
-    ("http://a/b/", "../../f/g/", "http://a/f/g/"),
-    ("a", "b", "b"),
-]
-
-
-@pytest.mark.parametrize("base,url,expected", URLLIB_URLJOIN)
-def test_join_cpython_urljoin(base, url, expected):
-    # tests from cpython urljoin
-    base = URL(base)
-    url = URL(url)
-    expected = URL(expected)
-    joined = base.join(url)
-    assert joined == expected
-
-
-def test_join_cpython_urljoin_fail():
-    with pytest.raises(
-        TypeError, match=r"unsupported operand type\(s\) for \+: 'NoneType' and 'str'"
-    ):
-        URL("http:///").join(URL(".."))
-    pytest.xfail("Shouldn't raise TypeError on empty host name")
-
-
 def test_split_result_non_decoded():
     with pytest.raises(ValueError):
         URL(SplitResult("http", "example.com", "path", "qs", "frag"))

diff --git a/yarl/_url.py b/yarl/_url.py
@@ -6,9 +6,7 @@
 from contextlib import suppress
 from ipaddress import ip_address
 from typing import Union
-from urllib.parse import SplitResult, parse_qsl, quote, urlsplit, urlunsplit
-from urllib.parse import uses_netloc as uses_authority
-from urllib.parse import uses_relative
+from urllib.parse import SplitResult, parse_qsl, quote, urljoin, urlsplit, urlunsplit
 
 import idna
 from multidict import MultiDict, MultiDictProxy
@@ -261,14 +259,12 @@
                 netloc = authority
             else:
                 tmp = SplitResult("", authority, "", "", "")
-                port = None if tmp.port == cls._default_port(scheme) else tmp.port
                 netloc = cls._make_netloc(
-                    tmp.username, tmp.password, tmp.hostname, port, encode=True
+                    tmp.username, tmp.password, tmp.hostname, tmp.port, encode=True
                 )
         elif not user and not password and not host and not port:
             netloc = ""
         else:
-            port = None if port == cls._default_port(scheme) else port
             netloc = cls._make_netloc(
                 user, password, host, port, encode=not encoded, encode_host=not encoded
             )
@@ -459,15 +455,12 @@
     def _get_default_port(self) -> Union[int, None]:
         if not self.scheme:
             return None
-        return self._default_port(self.scheme)
 
-    @staticmethod
-    def _default_port(scheme: str) -> Union[int, None]:
         with suppress(KeyError):
-            return DEFAULT_PORTS[scheme]
+            return DEFAULT_PORTS[self.scheme]
 
         with suppress(OSError):
-            return socket.getservbyname(scheme)
+            return socket.getservbyname(self.scheme)
 
         return None
 
@@ -769,7 +762,11 @@
                     f"Appending path {path!r} starting from slash is forbidden"
                 )
             path = path if encoded else self._PATH_QUOTER(path)
-            segments = list(reversed(path.split("/")))
+            segments = [
+                segment for segment in reversed(path.split("/")) if segment != "."
+            ]
+            if not segments:
+                continue
             # remove trailing empty segment for all but the last path
             segment_slice_start = int(not last and segments[0] == "")
             parsed += segments[segment_slice_start:]
@@ -1156,52 +1153,7 @@
         # See docs for urllib.parse.urljoin
         if not isinstance(url, URL):
             raise TypeError("url should be URL")
-        other: URL = url
-        scheme = other.scheme or self.scheme
-        parts = {
-            k: getattr(self, k) or ""
-            for k in ("authority", "path", "query_string", "fragment")
-        }
-        parts["scheme"] = scheme
-
-        if scheme != self.scheme or scheme not in uses_relative:
-            return URL(str(other))
-
-        # scheme is in uses_authority as uses_authority is a superset of uses_relative
-        if scheme in uses_authority and other.authority:
-            parts.update(
-                {
-                    k: getattr(other, k) or ""
-                    for k in ("authority", "path", "query_string", "fragment")
-                }
-            )
-            return URL.build(**parts)
-
-        if other.path or other.fragment:
-            parts["fragment"] = other.fragment or ""
-        if other.path or other.query:
-            parts["query_string"] = other.query_string or ""
-
-        if not other.path:
-            return URL.build(**parts)
-
-        if other.path[0] == "/":
-            parts["path"] = other.path
-            return URL.build(**parts)
-
-        if self.path[-1] == "/":
-            # using an intermediate to avoid URL.joinpath dropping query & fragment
-            parts["path"] = URL(self.path).joinpath(other.path).path
-        else:
-            # …
-            # and relativizing ".."
-            path = URL("/".join([*self.parts[1:-1], ""])).joinpath(other.path).path
-            if parts["authority"]:
-                parts["path"] = "/" + path
-            else:
-                parts["path"] = path
-
-        return URL.build(**parts)
+        return URL(urljoin(str(self), str(url)), encoded=True)
 
     def joinpath(self, *other, encoded=False):
         """Return a new URL with the elements in other appended to the path."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		Reverted RFC3986 compatible URL.join honoring empty segments which was introduced in :issue:`1039`.
Copy link Member webknjaz Aug 31, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. @bdraco FYI, Sphinx has a dedicated role `:rfc:` which should be used to link RFCs. The rendered style would also be more consistent. Copy link Member Author bdraco Sep 1, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. I did discover that, and I changed it before I published it in the release step. At least I hope I did as my burnout level was pretty high at that point

		This change introduced a regression handling query string parameters with joined URLs. The change was reverted to maintain compatibility with the previous behavior.