Skip to content

Commit

Permalink
Merge pull request #10378 from jdufresne/unescape
Browse files — browse the repository at this point in the history
Remove unnecessary html.unescape() calls in index/collector.py
  • Loading branch information
uranusjr authored Sep 5, 2021
2 parents b8fc219 + af34057 commit b9fcd55
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 23 deletions.
1 change: 1 addition & 0 deletions news/10378.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix double unescape of HTML ``data-requires-python`` and ``data-yanked`` attributes.
5 changes: 0 additions & 5 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import cgi
import collections
import functools
import html
import itertools
import logging
import os
Expand Down Expand Up @@ -248,11 +247,7 @@ def _create_link_from_element(

url = _clean_link(urllib.parse.urljoin(base_url, href))
pyrequire = anchor.get("data-requires-python")
pyrequire = html.unescape(pyrequire) if pyrequire else None

yanked_reason = anchor.get("data-yanked")
if yanked_reason:
yanked_reason = html.unescape(yanked_reason)

link = Link(
url,
Expand Down
63 changes: 45 additions & 18 deletions tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,45 @@ def test_clean_link(url, clean_url):
assert _clean_link(url) == clean_url


def _test_parse_links_data_attribute(anchor_html, attr, expected):
    """Parse a one-anchor page and check a single Link attribute.

    Wraps *anchor_html* in a minimal HTML document, feeds it through
    parse_links(), and asserts that the sole resulting Link's *attr*
    equals *expected*.
    """
    document = (
        '<html><head><meta charset="utf-8"><head>'
        f"<body>{anchor_html}</body></html>"
    )
    # parse_links() caches results by url; a random uuid guarantees a
    # cache miss so the freshly built page content is actually parsed.
    unique_url = f"https://example.com/simple-{uuid.uuid4()}/"
    page = HTMLPage(
        document.encode("utf-8"),
        encoding=None,
        url=unique_url,
    )
    (link,) = parse_links(page)
    assert getattr(link, attr) == expected


@pytest.mark.parametrize(
    "fragment, requires_python",
    [
        # Attribute absent entirely.
        ('<a href="/pkg-1.0.tar.gz"></a>', None),
        # Attribute present but with no value.
        ('<a href="/pkg-1.0.tar.gz" data-requires-python></a>', None),
        # An HTML-escaped character is decoded.
        (
            '<a href="/pkg-1.0.tar.gz" data-requires-python="&gt;=3.6"></a>',
            ">=3.6",
        ),
        # Double-escaped input must be unescaped exactly once.
        (
            '<a href="/pkg-1.0.tar.gz" data-requires-python="&amp;gt;=3.6"></a>',
            "&gt;=3.6",
        ),
    ],
)
def test_parse_links__requires_python(fragment, requires_python):
    """data-requires-python is exposed as Link.requires_python, unescaped once."""
    _test_parse_links_data_attribute(fragment, "requires_python", requires_python)


@pytest.mark.parametrize(
"anchor_html, expected",
[
Expand All @@ -428,27 +467,15 @@ def test_clean_link(url, clean_url):
'<a href="/pkg-1.0.tar.gz" data-yanked="curlyquote \u2018"></a>',
"curlyquote \u2018",
),
# Test yanked reason is unescaped once.
(
'<a href="/pkg-1.0.tar.gz" data-yanked="version &amp;lt; 1"></a>',
"version &lt; 1",
),
],
)
def test_parse_links__yanked_reason(anchor_html, expected):
html = (
# Mark this as a unicode string for Python 2 since anchor_html
# can contain non-ascii.
'<html><head><meta charset="utf-8"><head>'
"<body>{}</body></html>"
).format(anchor_html)
html_bytes = html.encode("utf-8")
page = HTMLPage(
html_bytes,
encoding=None,
# parse_links() is cached by url, so we inject a random uuid to ensure
# the page content isn't cached.
url=f"https://example.com/simple-{uuid.uuid4()}/",
)
links = list(parse_links(page))
(link,) = links
actual = link.yanked_reason
assert actual == expected
_test_parse_links_data_attribute(anchor_html, "yanked_reason", expected)


def test_parse_links_caches_same_page_by_url():
Expand Down

0 comments on commit b9fcd55

Please sign in to comment.