Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-82150: Make urllib.parse.urlsplit and urllib.parse.urlunsplit preserve the '?' and '#' delimiters of empty query and fragment components #15642

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions Doc/library/urllib.parse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -239,10 +239,7 @@ or on combining URL components into a URL string.
.. function:: urlunparse(parts)

Construct a URL from a tuple as returned by ``urlparse()``. The *parts*
argument can be any six-item iterable. This may result in a slightly
different, but equivalent URL, if the URL that was parsed originally had
unnecessary delimiters (for example, a ``?`` with an empty query; the RFC
states that these are equivalent).
argument can be any six-item iterable.


.. function:: urlsplit(urlstring, scheme='', allow_fragments=True)
Expand Down Expand Up @@ -307,9 +304,7 @@ or on combining URL components into a URL string.

Combine the elements of a tuple as returned by :func:`urlsplit` into a
complete URL as a string. The *parts* argument can be any five-item
iterable. This may result in a slightly different, but equivalent URL, if the
URL that was parsed originally had unnecessary delimiters (for example, a ?
with an empty query; the RFC states that these are equivalent).
iterable.


.. function:: urljoin(base, url, allow_fragments=True)
Expand Down
4 changes: 1 addition & 3 deletions Lib/test/test_urllib2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1080,9 +1080,7 @@ def test_full_url_setter(self):
parsed = urlparse(url)

self.assertEqual(r.get_full_url(), url)
# full_url setter uses splittag to split into components.
# splittag sets the fragment as None while urlparse sets it to ''
self.assertEqual(r.fragment or '', parsed.fragment)
self.assertEqual(r.fragment, parsed.fragment)
self.assertEqual(urlparse(r.get_full_url()).query, parsed.query)

def test_full_url_deleter(self):
Expand Down
123 changes: 66 additions & 57 deletions Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,34 +155,34 @@ def test_qs(self):
def test_roundtrips(self):
str_cases = [
('file:///tmp/junk.txt',
('file', '', '/tmp/junk.txt', '', '', ''),
('file', '', '/tmp/junk.txt', '', '')),
('file', '', '/tmp/junk.txt', '', None, None),
('file', '', '/tmp/junk.txt', None, None)),
('imap://mail.python.org/mbox1',
('imap', 'mail.python.org', '/mbox1', '', '', ''),
('imap', 'mail.python.org', '/mbox1', '', '')),
('imap', 'mail.python.org', '/mbox1', '', None, None),
('imap', 'mail.python.org', '/mbox1', None, None)),
('mms://wms.sys.hinet.net/cts/Drama/09006251100.asf',
('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
'', '', ''),
'', None, None),
('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf',
'', '')),
None, None)),
('nfs://server/path/to/file.txt',
('nfs', 'server', '/path/to/file.txt', '', '', ''),
('nfs', 'server', '/path/to/file.txt', '', '')),
('nfs', 'server', '/path/to/file.txt', '', None, None),
('nfs', 'server', '/path/to/file.txt', None, None)),
('svn+ssh://svn.zope.org/repos/main/ZConfig/trunk/',
('svn+ssh', 'svn.zope.org', '/repos/main/ZConfig/trunk/',
'', '', ''),
'', None, None),
('svn+ssh', 'svn.zope.org', '/repos/main/ZConfig/trunk/',
'', '')),
None, None)),
('git+ssh://[email protected]/user/project.git',
('git+ssh', '[email protected]','/user/project.git',
'','',''),
'', None, None),
('git+ssh', '[email protected]','/user/project.git',
'', '')),
None, None)),
]
def _encode(t):
return (t[0].encode('ascii'),
tuple(x.encode('ascii') for x in t[1]),
tuple(x.encode('ascii') for x in t[2]))
tuple(x.encode('ascii') if x is not None else None for x in t[1]),
tuple(x.encode('ascii') if x is not None else None for x in t[2]))
bytes_cases = [_encode(x) for x in str_cases]
for url, parsed, split in str_cases + bytes_cases:
self.checkRoundtrips(url, parsed, split)
Expand All @@ -193,25 +193,34 @@ def test_http_roundtrips(self):
# Three cheers for white box knowledge!
str_cases = [
('://www.python.org',
('www.python.org', '', '', '', ''),
('www.python.org', '', '', '')),
('www.python.org', '', '', None, None),
('www.python.org', '', None, None)),
('://www.python.org#abc',
('www.python.org', '', '', '', 'abc'),
('www.python.org', '', '', 'abc')),
('www.python.org', '', '', None, 'abc'),
('www.python.org', '', None, 'abc')),
('://www.python.org?q=abc',
('www.python.org', '', '', 'q=abc', ''),
('www.python.org', '', 'q=abc', '')),
('www.python.org', '', '', 'q=abc', None),
('www.python.org', '', 'q=abc', None)),
('://www.python.org/#abc',
('www.python.org', '/', '', '', 'abc'),
('www.python.org', '/', '', 'abc')),
('www.python.org', '/', '', None, 'abc'),
('www.python.org', '/', None, 'abc')),
('://a/b/c/d;p?q#f',
('a', '/b/c/d', 'p', 'q', 'f'),
('a', '/b/c/d;p', 'q', 'f')),
('://a/?',
('a', '/', '', '', None),
('a', '/', '', None)),
('://a/#',
('a', '/', '', None, ''),
('a', '/', None, '')),
('://a/?#',
('a', '/', '', '', ''),
('a', '/', '', '')),
]
def _encode(t):
return (t[0].encode('ascii'),
tuple(x.encode('ascii') for x in t[1]),
tuple(x.encode('ascii') for x in t[2]))
tuple(x.encode('ascii') if x is not None else None for x in t[1]),
tuple(x.encode('ascii') if x is not None else None for x in t[2]))
bytes_cases = [_encode(x) for x in str_cases]
str_schemes = ('http', 'https')
bytes_schemes = (b'http', b'https')
Expand Down Expand Up @@ -290,7 +299,7 @@ def test_RFC1808(self):
def test_RFC2368(self):
# Issue 11467: path that starts with a number is not parsed correctly
self.assertEqual(urllib.parse.urlparse('mailto:[email protected]'),
('mailto', '', '[email protected]', '', '', ''))
('mailto', '', '[email protected]', '', None, None))

def test_RFC2396(self):
# cases from RFC 2396
Expand Down Expand Up @@ -502,18 +511,18 @@ def _encode(t):
def test_urldefrag(self):
str_cases = [
('http://python.org#frag', 'http://python.org', 'frag'),
('http://python.org', 'http://python.org', ''),
('http://python.org', 'http://python.org', None),
('http://python.org/#frag', 'http://python.org/', 'frag'),
('http://python.org/', 'http://python.org/', ''),
('http://python.org/', 'http://python.org/', None),
('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
('http://python.org/?q', 'http://python.org/?q', ''),
('http://python.org/?q', 'http://python.org/?q', None),
('http://python.org/p#frag', 'http://python.org/p', 'frag'),
('http://python.org/p?q', 'http://python.org/p?q', ''),
('http://python.org/p?q', 'http://python.org/p?q', None),
(RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
(RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
(RFC2396_BASE, 'http://a/b/c/d;p?q', None),
]
def _encode(t):
return type(t)(x.encode('ascii') for x in t)
return type(t)(x.encode('ascii') if x is not None else None for x in t)
bytes_cases = [_encode(x) for x in str_cases]
for url, defrag, frag in str_cases + bytes_cases:
result = urllib.parse.urldefrag(url)
Expand All @@ -537,7 +546,7 @@ def test_urlsplit_attributes(self):
self.assertEqual(p.scheme, "http")
self.assertEqual(p.netloc, "WWW.PYTHON.ORG")
self.assertEqual(p.path, "/doc/")
self.assertEqual(p.query, "")
self.assertEqual(p.query, None)
self.assertEqual(p.fragment, "frag")
self.assertEqual(p.username, None)
self.assertEqual(p.password, None)
Expand Down Expand Up @@ -584,7 +593,7 @@ def test_urlsplit_attributes(self):
self.assertEqual(p.scheme, b"http")
self.assertEqual(p.netloc, b"WWW.PYTHON.ORG")
self.assertEqual(p.path, b"/doc/")
self.assertEqual(p.query, b"")
self.assertEqual(p.query, None)
self.assertEqual(p.fragment, b"frag")
self.assertEqual(p.username, None)
self.assertEqual(p.password, None)
Expand Down Expand Up @@ -684,44 +693,44 @@ def test_attributes_without_netloc(self):
def test_noslash(self):
# Issue 1637: http://foo.com?query is legal
self.assertEqual(urllib.parse.urlparse("http://example.com?blahblah=/foo"),
('http', 'example.com', '', '', 'blahblah=/foo', ''))
('http', 'example.com', '', '', 'blahblah=/foo', None))
self.assertEqual(urllib.parse.urlparse(b"http://example.com?blahblah=/foo"),
(b'http', b'example.com', b'', b'', b'blahblah=/foo', b''))
(b'http', b'example.com', b'', b'', b'blahblah=/foo', None))

def test_withoutscheme(self):
# Test urlparse without scheme
# Issue 754016: urlparse goes wrong with IP:port without scheme
# RFC 1808 specifies that netloc should start with //, urlparse expects
# the same, otherwise it classifies the portion of url as path.
self.assertEqual(urllib.parse.urlparse("path"),
('','','path','','',''))
('','','path','',None,None))
self.assertEqual(urllib.parse.urlparse("//www.python.org:80"),
('','www.python.org:80','','','',''))
('','www.python.org:80','','',None,None))
self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
('http','www.python.org:80','','','',''))
('http','www.python.org:80','','',None,None))
# Repeat for bytes input
self.assertEqual(urllib.parse.urlparse(b"path"),
(b'',b'',b'path',b'',b'',b''))
(b'',b'',b'path',b'',None,None))
self.assertEqual(urllib.parse.urlparse(b"//www.python.org:80"),
(b'',b'www.python.org:80',b'',b'',b'',b''))
(b'',b'www.python.org:80',b'',b'',None,None))
self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"),
(b'http',b'www.python.org:80',b'',b'',b'',b''))
(b'http',b'www.python.org:80',b'',b'',None,None))

def test_portseparator(self):
# Issue 754016 makes changes for port separator ':' from scheme separator
self.assertEqual(urllib.parse.urlparse("path:80"),
('','','path:80','','',''))
self.assertEqual(urllib.parse.urlparse("http:"),('http','','','','',''))
self.assertEqual(urllib.parse.urlparse("https:"),('https','','','','',''))
('','','path:80','',None,None))
self.assertEqual(urllib.parse.urlparse("http:"),('http','','','',None,None))
self.assertEqual(urllib.parse.urlparse("https:"),('https','','','',None,None))
self.assertEqual(urllib.parse.urlparse("http://www.python.org:80"),
('http','www.python.org:80','','','',''))
('http','www.python.org:80','','',None,None))
# As usual, need to check bytes input as well
self.assertEqual(urllib.parse.urlparse(b"path:80"),
(b'',b'',b'path:80',b'',b'',b''))
self.assertEqual(urllib.parse.urlparse(b"http:"),(b'http',b'',b'',b'',b'',b''))
self.assertEqual(urllib.parse.urlparse(b"https:"),(b'https',b'',b'',b'',b'',b''))
(b'',b'',b'path:80',b'',None,None))
self.assertEqual(urllib.parse.urlparse(b"http:"),(b'http',b'',b'',b'',None,None))
self.assertEqual(urllib.parse.urlparse(b"https:"),(b'https',b'',b'',b'',None,None))
self.assertEqual(urllib.parse.urlparse(b"http://www.python.org:80"),
(b'http',b'www.python.org:80',b'',b'',b'',b''))
(b'http',b'www.python.org:80',b'',b'',None,None))

def test_usingsys(self):
# Issue 3314: sys module is used in the error
Expand All @@ -730,23 +739,23 @@ def test_usingsys(self):
def test_anyscheme(self):
# Issue 7904: s3://foo.com/stuff has netloc "foo.com".
self.assertEqual(urllib.parse.urlparse("s3://foo.com/stuff"),
('s3', 'foo.com', '/stuff', '', '', ''))
('s3', 'foo.com', '/stuff', '', None, None))
self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff"),
('x-newscheme', 'foo.com', '/stuff', '', '', ''))
('x-newscheme', 'foo.com', '/stuff', '', None, None))
self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff?query#fragment"),
('x-newscheme', 'foo.com', '/stuff', '', 'query', 'fragment'))
self.assertEqual(urllib.parse.urlparse("x-newscheme://foo.com/stuff?query"),
('x-newscheme', 'foo.com', '/stuff', '', 'query', ''))
('x-newscheme', 'foo.com', '/stuff', '', 'query', None))

# And for bytes...
self.assertEqual(urllib.parse.urlparse(b"s3://foo.com/stuff"),
(b's3', b'foo.com', b'/stuff', b'', b'', b''))
(b's3', b'foo.com', b'/stuff', b'', None, None))
self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff"),
(b'x-newscheme', b'foo.com', b'/stuff', b'', b'', b''))
(b'x-newscheme', b'foo.com', b'/stuff', b'', None, None))
self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff?query#fragment"),
(b'x-newscheme', b'foo.com', b'/stuff', b'', b'query', b'fragment'))
self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff?query"),
(b'x-newscheme', b'foo.com', b'/stuff', b'', b'query', b''))
(b'x-newscheme', b'foo.com', b'/stuff', b'', b'query', None))

def test_default_scheme(self):
# Exercise the scheme parameter of urlparse() and urlsplit()
Expand Down Expand Up @@ -783,10 +792,10 @@ def test_parse_fragments(self):
attr = "path"
with self.subTest(url=url, function=func):
result = func(url, allow_fragments=False)
self.assertEqual(result.fragment, "")
self.assertEqual(result.fragment, None)
self.assertTrue(
getattr(result, attr).endswith("#" + expected_frag))
self.assertEqual(func(url, "", False).fragment, "")
self.assertEqual(func(url, "", False).fragment, None)

result = func(url, allow_fragments=True)
self.assertEqual(result.fragment, expected_frag)
Expand Down
37 changes: 22 additions & 15 deletions Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ def _encode_result(obj, encoding=_implicit_encoding,

def _decode_args(args, encoding=_implicit_encoding,
errors=_implicit_errors):
return tuple(x.decode(encoding, errors) if x else '' for x in args)
return tuple(x.decode(encoding, errors) if x else None if x is None else ''
for x in args)

def _coerce_args(*args):
# Invokes decode if necessary to create str args
Expand All @@ -129,15 +130,19 @@ class _ResultMixinStr(object):
__slots__ = ()

def encode(self, encoding='ascii', errors='strict'):
return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))
return self._encoded_counterpart(*(x.encode(encoding, errors)
if x is not None else None
for x in self))


class _ResultMixinBytes(object):
"""Standard approach to decoding parsed results from bytes to str"""
__slots__ = ()

def decode(self, encoding='ascii', errors='strict'):
return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))
return self._decoded_counterpart(*(x.decode(encoding, errors)
if x is not None else None
for x in self))


class _NetlocResultMixinBase(object):
Expand Down Expand Up @@ -428,7 +433,8 @@ def urlsplit(url, scheme='', allow_fragments=True):
return _coerce_result(cached)
if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
clear_cache()
netloc = query = fragment = ''
netloc = ''
query = fragment = None # no components NOR DELIMITERS
i = url.find(':')
if i > 0:
if url[:i] == 'http': # optimize the common case
Expand Down Expand Up @@ -484,20 +490,20 @@ def urlunparse(components):

def urlunsplit(components):
"""Combine the elements of a tuple as returned by urlsplit() into a
complete URL as a string. The data argument can be any five-item iterable.
This may result in a slightly different, but equivalent URL, if the URL that
was parsed originally had unnecessary delimiters (for example, a ? with an
empty query; the RFC states that these are equivalent)."""
complete URL as a string. The data argument can be any five-item
iterable."""
scheme, netloc, url, query, fragment, _coerce_result = (
_coerce_args(*components))
if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
if url and url[:1] != '/': url = '/' + url
url = '//' + (netloc or '') + url
if scheme:
url = scheme + ':' + url
if query:
# keep the delimiter if present (even if the component is empty)
if query is not None:
url = url + '?' + query
if fragment:
# keep the delimiter if present (even if the component is empty)
if fragment is not None:
url = url + '#' + fragment
return _coerce_result(url)

Expand Down Expand Up @@ -573,16 +579,17 @@ def urljoin(base, url, allow_fragments=True):
def urldefrag(url):
"""Removes any existing fragment from URL.

Returns a tuple of the defragmented URL and the fragment. If
the URL contained no fragments, the second element is the
empty string.
Returns a tuple of the defragmented URL and the fragment.
If the URL contained no fragment, the second element is None.
If the URL contained an empty fragment with its '#' delimiter, the second
element is the empty string.
"""
url, _coerce_result = _coerce_args(url)
if '#' in url:
s, n, p, a, q, frag = urlparse(url)
defrag = urlunparse((s, n, p, a, q, ''))
defrag = urlunparse((s, n, p, a, q, None))
else:
frag = ''
frag = None
defrag = url
return _coerce_result(DefragResult(defrag, frag))

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Make the :func:`urllib.parse.urlsplit` and :func:`urllib.parse.urlunsplit`
functions keep the ``?`` delimiter in a URI with an empty query component and
keep the ``#`` delimiter in a URI with an empty fragment component, as required
by `RFC 3986 <https://tools.ietf.org/html/rfc3986?#section-6.2.3>`_. Patch by
Géry Ogam.