From 95a0be3205a8b90652dc174f54c412b8684379d7 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sun, 8 May 2016 19:53:28 +0100 Subject: [PATCH] fixup! squash! Fix #72: rewrite the sanitizer to be a treewalker filter only. --- html5lib/tests/test_sanitizer.py | 115 +++++++++++-------------------- 1 file changed, 41 insertions(+), 74 deletions(-) diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py index e98c8c85..24a3b607 100644 --- a/html5lib/tests/test_sanitizer.py +++ b/html5lib/tests/test_sanitizer.py @@ -1,52 +1,43 @@ from __future__ import absolute_import, division, unicode_literals -try: - import json -except ImportError: - import simplejson as json - -from html5lib import html5parser, sanitizer, constants, treebuilders - - -def toxmlFactory(): - tree = treebuilders.getTreeBuilder("etree") - - def toxml(element): - # encode/decode roundtrip required for Python 2.6 compatibility - result_bytes = tree.implementation.tostring(element, encoding="utf-8") - return result_bytes.decode("utf-8") - - return toxml - - -def runSanitizerTest(name, expected, input, toxml=None): - if toxml is None: - toxml = toxmlFactory() - expected = ''.join([toxml(token) for token in html5parser.HTMLParser(). - parseFragment(expected)]) - expected = json.loads(json.dumps(expected)) +from html5lib import constants +from html5lib import parseFragment, serialize +from html5lib.filters import sanitizer + + +def runSanitizerTest(name, expected, input): + parsed = parseFragment(expected) + expected = serialize(parsed, + omit_optional_tags=False, + use_trailing_solidus=True, + space_before_trailing_solidus=False, + quote_attr_values=True, + quote_char='"') assert expected == sanitize_html(input) -def sanitize_html(stream, toxml=None): - if toxml is None: - toxml = toxmlFactory() - return ''.join([toxml(token) for token in - html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer). - parseFragment(stream)]) +def sanitize_html(stream): + parsed = parseFragment(stream) + serialized = serialize(parsed, + sanitize=True, + omit_optional_tags=False, + use_trailing_solidus=True, + space_before_trailing_solidus=False, + quote_attr_values=True, + quote_char='"') + return serialized def test_should_handle_astral_plane_characters(): - assert '\U0001d4b5 \U0001d538' == sanitize_html("

𝒵 𝔸

") + assert '

\U0001d4b5 \U0001d538

' == sanitize_html("

𝒵 𝔸

") def test_should_allow_relative_uris(): - assert '' == sanitize_html('

') + assert '

' == sanitize_html('

') def test_sanitizer(): - toxml = toxmlFactory() - for tag_name in sanitizer.HTMLSanitizer.allowed_elements: + for ns, tag_name in sanitizer.allowed_elements: if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']: continue # TODO if tag_name != tag_name.lower(): @@ -54,81 +45,57 @@ def test_sanitizer(): if tag_name == 'image': yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, "foo <bad>bar</bad> baz", - "<%s title='1'>foo bar baz" % (tag_name, tag_name), - toxml) + "<%s title='1'>foo bar baz" % (tag_name, tag_name)) elif tag_name == 'br': yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, "
foo <bad>bar</bad> baz
", - "<%s title='1'>foo bar baz" % (tag_name, tag_name), - toxml) + "<%s title='1'>foo bar baz" % (tag_name, tag_name)) elif tag_name in constants.voidElements: yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, "<%s title=\"1\"/>foo <bad>bar</bad> baz" % tag_name, - "<%s title='1'>foo bar baz" % (tag_name, tag_name), - toxml) + "<%s title='1'>foo bar baz" % (tag_name, tag_name)) else: yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name, "<%s title=\"1\">foo <bad>bar</bad> baz" % (tag_name, tag_name), - "<%s title='1'>foo bar baz" % (tag_name, tag_name), - toxml) + "<%s title='1'>foo bar baz" % (tag_name, tag_name)) - for tag_name in sanitizer.HTMLSanitizer.allowed_elements: - tag_name = tag_name.upper() - yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name, - "<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name), - "<%s title='1'>foo bar baz" % (tag_name, tag_name), - toxml) - - for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: + for ns, attribute_name in sanitizer.allowed_attributes: if attribute_name != attribute_name.lower(): continue # TODO if attribute_name == 'style': continue attribute_value = 'foo' - if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri: - attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0] + if attribute_name in sanitizer.attr_val_is_uri: + attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0] yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name, "

foo <bad>bar</bad> baz

" % (attribute_name, attribute_value), - "

foo bar baz

" % (attribute_name, attribute_value), - toxml) - - for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes: - attribute_name = attribute_name.upper() - yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name, - "

foo <bad>bar</bad> baz

", - "

foo bar baz

" % attribute_name, - toxml) + "

foo bar baz

" % (attribute_name, attribute_value)) - for protocol in sanitizer.HTMLSanitizer.allowed_protocols: + for protocol in sanitizer.allowed_protocols: rest_of_uri = '//sub.domain.tld/path/object.ext' if protocol == 'data': rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, "foo" % (protocol, rest_of_uri), - """foo""" % (protocol, rest_of_uri), - toxml) + """foo""" % (protocol, rest_of_uri)) yield (runSanitizerTest, "test_invalid_data_uri", "", - "", - toxml) + "") yield (runSanitizerTest, "test_invalid_ipv6_url", "", - "", - toxml) + "") yield (runSanitizerTest, "test_data_uri_disallowed_type", "", - "", - toxml) + "") - for protocol in sanitizer.HTMLSanitizer.allowed_protocols: + for protocol in sanitizer.allowed_protocols: rest_of_uri = '//sub.domain.tld/path/object.ext' if protocol == 'data': rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ=' protocol = protocol.upper() yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, "foo" % (protocol, rest_of_uri), - """foo""" % (protocol, rest_of_uri), - toxml) + """foo""" % (protocol, rest_of_uri))