fixup! squash! Fix #72: rewrite the sanitizer to be a treewalker filter only.
html5lib · May 8, 2016 · 95a0be3 · 95a0be3
1 parent 08a5eca
commit 95a0be3
Showing 1 changed file with 41 additions and 74 deletions.
diff --git a/html5lib/tests/test_sanitizer.py b/html5lib/tests/test_sanitizer.py
@@ -1,134 +1,101 @@
 from __future__ import absolute_import, division, unicode_literals
 
-try:
-    import json
-except ImportError:
-    import simplejson as json
-
-from html5lib import html5parser, sanitizer, constants, treebuilders
-
-
-def toxmlFactory():
-    tree = treebuilders.getTreeBuilder("etree")
-
-    def toxml(element):
-        # encode/decode roundtrip required for Python 2.6 compatibility
-        result_bytes = tree.implementation.tostring(element, encoding="utf-8")
-        return result_bytes.decode("utf-8")
-
-    return toxml
-
-
-def runSanitizerTest(name, expected, input, toxml=None):
-    if toxml is None:
-        toxml = toxmlFactory()
-    expected = ''.join([toxml(token) for token in html5parser.HTMLParser().
-                        parseFragment(expected)])
-    expected = json.loads(json.dumps(expected))
+from html5lib import constants
+from html5lib import parseFragment, serialize
+from html5lib.filters import sanitizer
+
+
+def runSanitizerTest(name, expected, input):
+    parsed = parseFragment(expected)
+    expected = serialize(parsed,
+                         omit_optional_tags=False,
+                         use_trailing_solidus=True,
+                         space_before_trailing_solidus=False,
+                         quote_attr_values=True,
+                         quote_char='"')
     assert expected == sanitize_html(input)
 
 
-def sanitize_html(stream, toxml=None):
-    if toxml is None:
-        toxml = toxmlFactory()
-    return ''.join([toxml(token) for token in
-                    html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
-                    parseFragment(stream)])
+def sanitize_html(stream):
+    parsed = parseFragment(stream)
+    serialized = serialize(parsed,
+                           sanitize=True,
+                           omit_optional_tags=False,
+                           use_trailing_solidus=True,
+                           space_before_trailing_solidus=False,
+                           quote_attr_values=True,
+                           quote_char='"')
+    return serialized
 
 
 def test_should_handle_astral_plane_characters():
-    assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
+    assert '<p>\U0001d4b5 \U0001d538</p>' == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
 
 
 def test_should_allow_relative_uris():
-    assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>' == sanitize_html('<p><a href="/example.com"></a></p>')
+    assert '<p><a href="/example.com"></a></p>' == sanitize_html('<p><a href="/example.com"></a></p>')
 
 
 def test_sanitizer():
-    toxml = toxmlFactory()
-    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
+    for ns, tag_name in sanitizer.allowed_elements:
         if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
             continue  # TODO
         if tag_name != tag_name.lower():
             continue  # TODO
         if tag_name == 'image':
             yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                    "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
-                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
-                   toxml)
+                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
         elif tag_name == 'br':
             yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                    "<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
-                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
-                   toxml)
+                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
         elif tag_name in constants.voidElements:
             yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                    "<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
-                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
-                   toxml)
+                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
         else:
             yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
                    "<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
-                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
-                   toxml)
+                   "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
 
-    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
-        tag_name = tag_name.upper()
-        yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
-               "&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name, tag_name),
-               "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
-               toxml)
-
-    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
+    for ns, attribute_name in sanitizer.allowed_attributes:
         if attribute_name != attribute_name.lower():
             continue  # TODO
         if attribute_name == 'style':
             continue
         attribute_value = 'foo'
-        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
-            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
+        if attribute_name in sanitizer.attr_val_is_uri:
+            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0]
         yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
                "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
-               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
-               toxml)
-
-    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
-        attribute_name = attribute_name.upper()
-        yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
-               "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
-               "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
-               toxml)
+               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value))
 
-    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
+    for protocol in sanitizer.allowed_protocols:
         rest_of_uri = '//sub.domain.tld/path/object.ext'
         if protocol == 'data':
             rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
                "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
-               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
-               toxml)
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
 
     yield (runSanitizerTest, "test_invalid_data_uri",
            "<audio controls=\"\"></audio>",
-           "<audio controls=\"\" src=\"data:foobar\"></audio>",
-           toxml)
+           "<audio controls=\"\" src=\"data:foobar\"></audio>")
 
     yield (runSanitizerTest, "test_invalid_ipv6_url",
            "<a>",
-           "<a href=\"h://]\">",
-           toxml)
+           "<a href=\"h://]\">")
 
     yield (runSanitizerTest, "test_data_uri_disallowed_type",
            "<audio controls=\"\"></audio>",
-           "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>",
-           toxml)
+           "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>")
 
-    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
+    for protocol in sanitizer.allowed_protocols:
         rest_of_uri = '//sub.domain.tld/path/object.ext'
         if protocol == 'data':
             rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
         protocol = protocol.upper()
         yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
                "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
-               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
-               toxml)
+               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))