-
Notifications
You must be signed in to change notification settings - Fork 286
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fixup! squash! Fix #72: rewrite the sanitizer to be a treewalker filter only.
- Loading branch information
Showing 1 changed file with 41 additions and 74 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,134 +1,101 @@ | ||
from __future__ import absolute_import, division, unicode_literals | ||
|
||
try: | ||
import json | ||
except ImportError: | ||
import simplejson as json | ||
|
||
from html5lib import html5parser, sanitizer, constants, treebuilders | ||
|
||
|
||
def toxmlFactory(): | ||
tree = treebuilders.getTreeBuilder("etree") | ||
|
||
def toxml(element): | ||
# encode/decode roundtrip required for Python 2.6 compatibility | ||
result_bytes = tree.implementation.tostring(element, encoding="utf-8") | ||
return result_bytes.decode("utf-8") | ||
|
||
return toxml | ||
|
||
|
||
def runSanitizerTest(name, expected, input, toxml=None): | ||
if toxml is None: | ||
toxml = toxmlFactory() | ||
expected = ''.join([toxml(token) for token in html5parser.HTMLParser(). | ||
parseFragment(expected)]) | ||
expected = json.loads(json.dumps(expected)) | ||
from html5lib import constants | ||
from html5lib import parseFragment, serialize | ||
from html5lib.filters import sanitizer | ||
|
||
|
||
def runSanitizerTest(name, expected, input): | ||
parsed = parseFragment(expected) | ||
expected = serialize(parsed, | ||
omit_optional_tags=False, | ||
use_trailing_solidus=True, | ||
space_before_trailing_solidus=False, | ||
quote_attr_values=True, | ||
quote_char='"') | ||
assert expected == sanitize_html(input) | ||
|
||
|
||
def sanitize_html(stream, toxml=None): | ||
if toxml is None: | ||
toxml = toxmlFactory() | ||
return ''.join([toxml(token) for token in | ||
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer). | ||
parseFragment(stream)]) | ||
def sanitize_html(stream): | ||
parsed = parseFragment(stream) | ||
serialized = serialize(parsed, | ||
sanitize=True, | ||
omit_optional_tags=False, | ||
use_trailing_solidus=True, | ||
space_before_trailing_solidus=False, | ||
quote_attr_values=True, | ||
quote_char='"') | ||
return serialized | ||
|
||
|
||
def test_should_handle_astral_plane_characters(): | ||
assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>𝒵 𝔸</p>") | ||
assert '<p>\U0001d4b5 \U0001d538</p>' == sanitize_html("<p>𝒵 𝔸</p>") | ||
|
||
|
||
def test_should_allow_relative_uris(): | ||
assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>' == sanitize_html('<p><a href="/example.com"></a></p>') | ||
assert '<p><a href="/example.com"></a></p>' == sanitize_html('<p><a href="/example.com"></a></p>') | ||
|
||
|
||
def test_sanitizer(): | ||
toxml = toxmlFactory() | ||
for tag_name in sanitizer.HTMLSanitizer.allowed_elements: | ||
for ns, tag_name in sanitizer.allowed_elements: | ||
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']: | ||
continue # TODO | ||
if tag_name != tag_name.lower(): | ||
continue # TODO | ||
if tag_name == 'image': | ||
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, | ||
"<img title=\"1\"/>foo <bad>bar</bad> baz", | ||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), | ||
toxml) | ||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)) | ||
elif tag_name == 'br': | ||
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, | ||
"<br title=\"1\"/>foo <bad>bar</bad> baz<br/>", | ||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), | ||
toxml) | ||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)) | ||
elif tag_name in constants.voidElements: | ||
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, | ||
"<%s title=\"1\"/>foo <bad>bar</bad> baz" % tag_name, | ||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), | ||
toxml) | ||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)) | ||
else: | ||
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, | ||
"<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), | ||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), | ||
toxml) | ||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)) | ||
|
||
for tag_name in sanitizer.HTMLSanitizer.allowed_elements: | ||
tag_name = tag_name.upper() | ||
yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name, | ||
"<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), | ||
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), | ||
toxml) | ||
|
||
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: | ||
for ns, attribute_name in sanitizer.allowed_attributes: | ||
if attribute_name != attribute_name.lower(): | ||
continue # TODO | ||
if attribute_name == 'style': | ||
continue | ||
attribute_value = 'foo' | ||
if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri: | ||
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0] | ||
if attribute_name in sanitizer.attr_val_is_uri: | ||
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0] | ||
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name, | ||
"<p %s=\"%s\">foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value), | ||
"<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value), | ||
toxml) | ||
|
||
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: | ||
attribute_name = attribute_name.upper() | ||
yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name, | ||
"<p>foo <bad>bar</bad> baz</p>", | ||
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name, | ||
toxml) | ||
"<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value)) | ||
|
||
for protocol in sanitizer.HTMLSanitizer.allowed_protocols: | ||
for protocol in sanitizer.allowed_protocols: | ||
rest_of_uri = '//sub.domain.tld/path/object.ext' | ||
if protocol == 'data': | ||
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' | ||
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, | ||
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri), | ||
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri), | ||
toxml) | ||
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri)) | ||
|
||
yield (runSanitizerTest, "test_invalid_data_uri", | ||
"<audio controls=\"\"></audio>", | ||
"<audio controls=\"\" src=\"data:foobar\"></audio>", | ||
toxml) | ||
"<audio controls=\"\" src=\"data:foobar\"></audio>") | ||
|
||
yield (runSanitizerTest, "test_invalid_ipv6_url", | ||
"<a>", | ||
"<a href=\"h://]\">", | ||
toxml) | ||
"<a href=\"h://]\">") | ||
|
||
yield (runSanitizerTest, "test_data_uri_disallowed_type", | ||
"<audio controls=\"\"></audio>", | ||
"<audio controls=\"\" src=\"data:text/html,<html>\"></audio>", | ||
toxml) | ||
"<audio controls=\"\" src=\"data:text/html,<html>\"></audio>") | ||
|
||
for protocol in sanitizer.HTMLSanitizer.allowed_protocols: | ||
for protocol in sanitizer.allowed_protocols: | ||
rest_of_uri = '//sub.domain.tld/path/object.ext' | ||
if protocol == 'data': | ||
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' | ||
protocol = protocol.upper() | ||
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, | ||
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri), | ||
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri), | ||
toxml) | ||
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri)) |