diff --git a/CHANGES/516.bugfix b/CHANGES/516.bugfix new file mode 100644 index 000000000..fbffe1dbb --- /dev/null +++ b/CHANGES/516.bugfix @@ -0,0 +1 @@ +Fix ValueError when decoding ``%`` which is not followed by two hexadecimal digits. \ No newline at end of file diff --git a/CHANGES/520.bugfix b/CHANGES/520.bugfix new file mode 100644 index 000000000..54b375220 --- /dev/null +++ b/CHANGES/520.bugfix @@ -0,0 +1 @@ +Fix decoding ``%`` followed by a space and hexadecimal digit. diff --git a/tests/test_quoting.py b/tests/test_quoting.py index 7d33a80d6..bb3375c00 100644 --- a/tests/test_quoting.py +++ b/tests/test_quoting.py @@ -182,25 +182,29 @@ def test_unquoting(num, unquoter): assert expect == result -@pytest.mark.xfail -# FIXME: Expected value should be the same as given. -# See https://url.spec.whatwg.org/#percent-encoded-bytes -def test_unquoting_bad_percent_escapes_1(unquoter): - assert "%" == unquoter()("%") - - -@pytest.mark.xfail -# FIXME: Expected value should be the same as given. -# See https://url.spec.whatwg.org/#percent-encoded-bytes -def test_unquoting_bad_percent_escapes_2(unquoter): - assert "%x" == unquoter()("%x") - - -@pytest.mark.xfail -# FIXME: Expected value should be the same as given. +# Expected value should be the same as given. # See https://url.spec.whatwg.org/#percent-encoded-bytes -def test_unquoting_bad_percent_escapes_3(unquoter): - assert "%xa" == unquoter()("%xa") +@pytest.mark.parametrize( + ("input", "expected"), + [ + ("%", "%"), + ("%2", "%2"), + ("%x", "%x"), + ("%€", "%€"), + ("%2x", "%2x"), + ("%2 ", "%2 "), + ("% 2", "% 2"), + ("%xa", "%xa"), + ("%%", "%%"), + ("%%3f", "%?"), + ("%2%", "%2%"), + ("%2%3f", "%2?"), + ("%x%3f", "%x?"), + ("%€%3f", "%€?"), + ], +) +def test_unquoting_bad_percent_escapes(unquoter, input, expected): + assert unquoter()(input) == expected @pytest.mark.xfail diff --git a/yarl/_quoting_c.pyx b/yarl/_quoting_c.pyx index 8308f0b18..557e40611 100644 --- a/yarl/_quoting_c.pyx +++ b/yarl/_quoting_c.pyx @@ -328,19 +328,17 @@ cdef class _Unquoter: cdef str _do_unquote(self, str val): if len(val) == 0: return val - cdef str pct = '' cdef str last_pct = '' cdef bytearray pcts = bytearray() cdef list ret = [] cdef str unquoted - for ch in val: - if pct: - pct += ch - if len(pct) == 3: # pragma: no branch # peephole optimizer - pcts.append(int(pct[1:], base=16)) - last_pct = pct - pct = '' - continue + cdef Py_UCS4 ch = 0 + cdef int idx = 0 + cdef int length = len(val) + + while idx < length: + ch = val[idx] + idx += 1 if pcts: try: unquoted = pcts.decode('utf8') @@ -355,8 +353,14 @@ cdef class _Unquoter: ret.append(unquoted) del pcts[:] - if ch == '%': - pct = ch + if ch == '%' and idx <= length - 2: + ch = _restore_ch(val[idx], val[idx + 1]) + if ch == -1: + ret.append("%") + else: + pcts.append(ch) + last_pct = val[idx - 1 : idx + 2] + idx += 2 continue if pcts: diff --git a/yarl/_quoting_py.py b/yarl/_quoting_py.py index 4c54bef6f..cd16aa3a2 100644 --- a/yarl/_quoting_py.py +++ b/yarl/_quoting_py.py @@ -14,6 +14,7 @@ _IS_HEX = re.compile(b"[A-Z0-9][A-Z0-9]") +_IS_HEX_STR = re.compile("[A-Fa-f0-9][A-Fa-f0-9]") class _Quoter: @@ -126,18 +127,13 @@ def __call__(self, val: Optional[str]) -> Optional[str]: raise TypeError("Argument should be str") if not val: return "" - pct = "" last_pct = "" pcts = bytearray() ret = [] - for ch in val: - if pct: - pct += ch - if len(pct) == 3: # pragma: no branch # peephole optimizer - pcts.append(int(pct[1:], base=16)) - last_pct = pct - pct = "" - continue + idx = 0 + while idx < len(val): + ch = val[idx] + idx += 1 if pcts: try: unquoted = pcts.decode("utf8") @@ -158,9 +154,17 @@ def __call__(self, val: Optional[str]) -> Optional[str]: ret.append(unquoted) del pcts[:] - if ch == "%": - pct = ch - continue + if ch == "%" and idx <= len(val) - 2: + pct = val[idx : idx + 2] # noqa: E203 + if _IS_HEX_STR.fullmatch(pct): + try: + pcts.append(int(pct, base=16)) + except ValueError: + ret.append("%") + else: + last_pct = "%" + pct + idx += 2 + continue if pcts: ret.append(last_pct) # %F8ab