Skip to content

Commit

Permalink
Unescapes html in PageParser.href_match_to_url
Browse files Browse the repository at this point in the history
PageParser breaks if a link contains HTML-escaped characters (for example,
`&amp;` in a query string). This fixes that bug by unescaping each extracted href.
  • Loading branch information
daveFNbuck committed Jan 2, 2016
1 parent fcdee8a commit 8da3523
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 1 deletion.
11 changes: 10 additions & 1 deletion pex/crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@
from urlparse import urlparse


def unescape(s):
    """Decode the HTML character references commonly found in attribute values.

    Handles ``&lt;``, ``&gt;``, ``&quot;``, ``&apos;``, the numeric
    single-quote forms ``&#39;`` / ``&#x27;`` and finally ``&amp;``. Any other
    reference is returned untouched. Based on the recipe at
    https://wiki.python.org/moin/EscapingHtml.

    Only semicolon-terminated references are decoded, so a raw (unescaped)
    query string such as ``a=1&param2=2`` passes through unchanged.

    :param s: text that may contain HTML character references.
    :returns: ``s`` with the references listed above replaced by the
      characters they stand for.
    """
    for entity, char in (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", '"'),
        ("&apos;", "'"),
        ("&#39;", "'"),
        ("&#x27;", "'"),
    ):
        s = s.replace(entity, char)
    # "&amp;" has to be decoded last: decoding it first would turn the
    # doubly-escaped "&amp;lt;" into "&lt;" and then, wrongly, into "<".
    return s.replace("&amp;", "&")


class PageParser(object):
"""A helper class to extract and differentiate ordinary and download links from webpages."""

Expand All @@ -34,7 +43,7 @@ class PageParser(object):
def href_match_to_url(cls, match):
# Turn a regex match object for an href attribute into the link text.
# The result is the first non-empty value among capture groups 1, 2 and 3,
# or '' when none of them matched anything.
def pick(group):
# A group that did not participate in the match is None; map it to '' so
# the `or` chain below can fall through to the next group.
return '' if group is None else group
# NOTE(review): this view shows both sides of the diff hunk. The first return
# looks like the removed line and the second (which additionally passes the
# value through unescape()) its replacement -- confirm against the repository.
return pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3))
return unescape(pick(match.group(1)) or pick(match.group(2)) or pick(match.group(3)))

@classmethod
def rel_links(cls, page):
Expand Down
6 changes: 6 additions & 0 deletions tests/test_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ def test_page_parser_basic():
assert lpp("<a href='stuff'> <a href=%s>" % target) == (['stuff', href], [])


def test_page_parser_escaped_html():
    """An ``&amp;`` inside an href value is reported as a plain ``&``."""
    expected = 'url?param1=val&param2=val2'
    # Escape the ampersand the way it would appear in HTML markup.
    escaped = expected.replace('&', '&amp;')
    page = 'a href="%s"' % escaped
    assert lpp(page) == ([expected], [])


def test_page_parser_rels():
VALID_RELS = tuple(PageParser.REL_TYPES)
for rel in VALID_RELS + ('', ' ', 'blah'):
Expand Down

0 comments on commit 8da3523

Please sign in to comment.