From 29231a10eb983e25b59c8edc5b6abcb12dbaaabe Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 21 Jan 2023 17:51:59 -0500 Subject: [PATCH 1/5] Convert tags, skip_tags, recognized_tags to sets; fix doctests; f-strings This converts the "tags" argument to BleachHTMLParser to be a set. This converts the "skip_tags" and "recognized_tags" to linkify things to be sets. This updates the documentation fixing example code so that tags, skip_tags, and recognized_tags are all sets. This also converts some string interpolation from %s style to f-strings. --- bleach/html5lib_shim.py | 242 ++++++++++++++++++++-------------------- bleach/linkifier.py | 14 +-- bleach/sanitizer.py | 35 +++--- docs/clean.rst | 31 ++--- docs/linkify.rst | 10 +- 5 files changed, 166 insertions(+), 166 deletions(-) diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index 022fe19e..aa5189b1 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -81,127 +81,129 @@ #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17 #: https://html.spec.whatwg.org/multipage/indices.html#elements-3 -HTML_TAGS = [ - "a", - "abbr", - "address", - "area", - "article", - "aside", - "audio", - "b", - "base", - "bdi", - "bdo", - "blockquote", - "body", - "br", - "button", - "canvas", - "caption", - "cite", - "code", - "col", - "colgroup", - "data", - "datalist", - "dd", - "del", - "details", - "dfn", - "dialog", - "div", - "dl", - "dt", - "em", - "embed", - "fieldset", - "figcaption", - "figure", - "footer", - "form", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "head", - "header", - "hgroup", - "hr", - "html", - "i", - "iframe", - "img", - "input", - "ins", - "kbd", - "keygen", - "label", - "legend", - "li", - "link", - "map", - "mark", - "menu", - "meta", - "meter", - "nav", - "noscript", - "object", - "ol", - "optgroup", - "option", - "output", - "p", - "param", - "picture", - "pre", - "progress", - "q", - "rp", - "rt", - "ruby", - "s", - "samp", - "script", 
- "section", - "select", - "slot", - "small", - "source", - "span", - "strong", - "style", - "sub", - "summary", - "sup", - "table", - "tbody", - "td", - "template", - "textarea", - "tfoot", - "th", - "thead", - "time", - "title", - "tr", - "track", - "u", - "ul", - "var", - "video", - "wbr", -] +HTML_TAGS = frozenset( + ( + "a", + "abbr", + "address", + "area", + "article", + "aside", + "audio", + "b", + "base", + "bdi", + "bdo", + "blockquote", + "body", + "br", + "button", + "canvas", + "caption", + "cite", + "code", + "col", + "colgroup", + "data", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "div", + "dl", + "dt", + "em", + "embed", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hgroup", + "hr", + "html", + "i", + "iframe", + "img", + "input", + "ins", + "kbd", + "keygen", + "label", + "legend", + "li", + "link", + "map", + "mark", + "menu", + "meta", + "meter", + "nav", + "noscript", + "object", + "ol", + "optgroup", + "option", + "output", + "p", + "param", + "picture", + "pre", + "progress", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "section", + "select", + "slot", + "small", + "source", + "span", + "strong", + "style", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "u", + "ul", + "var", + "video", + "wbr", + ) +) #: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369 #: from mozilla on 2019.07.11 #: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements HTML_TAGS_BLOCK_LEVEL = frozenset( - [ + ( "address", "article", "aside", @@ -235,7 +237,7 @@ "section", "table", "ul", - ] + ) ) @@ -476,7 +478,7 @@ class BleachHTMLParser(HTMLParser): def __init__(self, tags, strip, consume_entities, **kwargs): """ - :arg tags: list of allowed tags--everything else is 
either stripped or + :arg tags: set of allowed tags--everything else is either stripped or escaped; if None, then this doesn't look at tags at all :arg strip: whether to strip disallowed tags (True) or escape them (False); if tags=None, then this doesn't have any effect @@ -484,7 +486,9 @@ def __init__(self, tags, strip, consume_entities, **kwargs): leave them as is when tokenizing (BleachHTMLTokenizer-added behavior) """ - self.tags = [tag.lower() for tag in tags] if tags is not None else None + self.tags = ( + frozenset((tag.lower() for tag in tags)) if tags is not None else None + ) self.strip = strip self.consume_entities = consume_entities super().__init__(**kwargs) @@ -694,7 +698,7 @@ def escape_base_amp(self, stoken): # Only leave entities in that are not ambiguous. If they're # ambiguous, then we escape the ampersand. if entity is not None and convert_entity(entity) is not None: - yield "&" + entity + ";" + yield f"&{entity};" # Length of the entity plus 2--one for & at the beginning # and one for ; at the end diff --git a/bleach/linkifier.py b/bleach/linkifier.py index 343f374f..183eb5ba 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -120,8 +120,8 @@ def __init__( :arg list callbacks: list of callbacks to run when adjusting tag attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS`` - :arg list skip_tags: list of tags that you don't want to linkify the - contents of; for example, you could set this to ``['pre']`` to skip + :arg set skip_tags: set of tags that you don't want to linkify the + contents of; for example, you could set this to ``{'pre'}`` to skip linkifying contents of ``pre`` tags :arg bool parse_email: whether or not to linkify email addresses @@ -130,7 +130,7 @@ def __init__( :arg email_re: email matching regex - :arg list recognized_tags: the list of tags that linkify knows about; + :arg set recognized_tags: the set of tags that linkify knows about; everything else gets escaped :returns: linkified text as unicode @@ 
-145,7 +145,7 @@ def __init__( # Create a parser/tokenizer that allows all HTML tags and escapes # anything not in that list. self.parser = html5lib_shim.BleachHTMLParser( - tags=recognized_tags, + tags=frozenset(recognized_tags), strip=False, consume_entities=False, namespaceHTMLElements=False, @@ -221,8 +221,8 @@ def __init__( :arg list callbacks: list of callbacks to run when adjusting tag attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS`` - :arg list skip_tags: list of tags that you don't want to linkify the - contents of; for example, you could set this to ``['pre']`` to skip + :arg set skip_tags: set of tags that you don't want to linkify the + contents of; for example, you could set this to ``{'pre'}`` to skip linkifying contents of ``pre`` tags :arg bool parse_email: whether or not to linkify email addresses @@ -235,7 +235,7 @@ def __init__( super().__init__(source) self.callbacks = callbacks or [] - self.skip_tags = skip_tags or [] + self.skip_tags = skip_tags or {} self.parse_email = parse_email self.url_re = url_re diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 35ccf71e..8662a879 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -35,7 +35,7 @@ } #: List of allowed protocols -ALLOWED_PROTOCOLS = ["http", "https", "mailto"] +ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto")) #: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr) INVISIBLE_CHARACTERS = "".join( @@ -180,9 +180,8 @@ def clean(self, text): """ if not isinstance(text, str): message = ( - "argument cannot be of '{name}' type, must be of text type".format( - name=text.__class__.__name__ - ) + f"argument cannot be of {text.__class__.__name__!r} type, " + + "must be of text type" ) raise TypeError(message) @@ -308,7 +307,7 @@ def __init__( html5lib_shim.Filter.__init__(self, source) self.allowed_tags = frozenset(allowed_tags) - self.allowed_protocols = allowed_protocols + self.allowed_protocols = frozenset(allowed_protocols) 
self.attr_filter = attribute_filter_factory(attributes) self.strip_disallowed_tags = strip_disallowed_tags @@ -603,7 +602,7 @@ def allow_token(self, token): def disallowed_token(self, token): token_type = token["type"] if token_type == "EndTag": - token["data"] = "" % token["name"] + token["data"] = f"" elif token["data"]: assert token_type in ("StartTag", "EmptyTag") @@ -619,25 +618,19 @@ def disallowed_token(self, token): if ns is None or ns not in html5lib_shim.prefixes: namespaced_name = name else: - namespaced_name = "{}:{}".format(html5lib_shim.prefixes[ns], name) - - attrs.append( - ' %s="%s"' - % ( - namespaced_name, - # NOTE(willkg): HTMLSerializer escapes attribute values - # already, so if we do it here (like HTMLSerializer does), - # then we end up double-escaping. - v, - ) - ) - token["data"] = "<{}{}>".format(token["name"], "".join(attrs)) + namespaced_name = f"{html5lib_shim.prefixes[ns]}:{name}" + + # NOTE(willkg): HTMLSerializer escapes attribute values + # already, so if we do it here (like HTMLSerializer does), + # then we end up double-escaping. + attrs.append(f' {namespaced_name}="{v}"') + token["data"] = f"<{token['name']}{''.join(attrs)}>" else: - token["data"] = "<%s>" % token["name"] + token["data"] = f"<{token['name']}>" if token.get("selfClosing"): - token["data"] = token["data"][:-1] + "/>" + token["data"] = f"{token['data'][:-1]}/>" token["type"] = "Characters" diff --git a/docs/clean.rst b/docs/clean.rst index 9ebf74bb..bcc98112 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -21,14 +21,18 @@ can be used in HTML as is. templates (mustache, handlebars, angular, jsx, etc), JSON, xhtml, SVG, or other contexts. - For example, this is a safe use of ``clean`` output in an HTML context:: + For example, this is a safe use of ``clean`` output in an HTML context: + + .. code-block:: html

{{ bleach.clean(user_bio) }}

- This is **not a safe** use of ``clean`` output in an HTML attribute:: + This is **not a safe** use of ``clean`` output in an HTML attribute: + + .. code-block:: html @@ -106,7 +110,7 @@ For example: >>> bleach.clean( ... '

blah blah blah

', - ... tags=['p'], + ... tags={'p'}, ... attributes=['class'], ... ) '

blah blah blah

' @@ -134,7 +138,7 @@ and "class" for any tag (including "a" and "img"): >>> bleach.clean( ... 'an example', - ... tags=['img'], + ... tags={'img'}, ... attributes=attrs ... ) 'an example' @@ -160,7 +164,7 @@ For example: >>> bleach.clean( ... 'link', - ... tags=['a'], + ... tags={'a'}, ... attributes=allow_h, ... ) 'link' @@ -184,7 +188,7 @@ attributes for specified tags: >>> bleach.clean( ... 'an example', - ... tags=['img'], + ... tags={'img'}, ... attributes={ ... 'img': allow_src ... } @@ -213,7 +217,7 @@ For example, this sets allowed protocols to http, https and smb: >>> bleach.clean( ... 'allowed protocol', - ... protocols=['http', 'https', 'smb'] + ... protocols={'http', 'https', 'smb'} ... ) 'allowed protocol' @@ -224,9 +228,10 @@ This adds smb to the Bleach-specified set of allowed protocols: >>> import bleach + >>> my_protocols = bleach.ALLOWED_PROTOCOLS.union({'smb'}) >>> bleach.clean( ... 'allowed protocol', - ... protocols=bleach.ALLOWED_PROTOCOLS + ['smb'] + ... protocols=my_protocols ... ) 'allowed protocol' @@ -249,7 +254,7 @@ and invalid markup. For example: >>> bleach.clean('is not allowed') '<span>is not allowed</span>' - >>> bleach.clean('is not allowed', tags=['b']) + >>> bleach.clean('is not allowed', tags={'b'}) '<span>is not allowed</span>' @@ -263,7 +268,7 @@ If you would rather Bleach stripped this markup entirely, you can pass >>> bleach.clean('is not allowed', strip=True) 'is not allowed' - >>> bleach.clean('is not allowed', tags=['b'], strip=True) + >>> bleach.clean('is not allowed', tags={'b'}, strip=True) 'is not allowed' @@ -309,7 +314,7 @@ For example: >>> css_sanitizer = CSSSanitizer(allowed_css_properties=["color", "font-weight"]) - >>> tags = ['p', 'em', 'strong'] + >>> tags = {'p', 'em', 'strong'} >>> attrs = { ... '*': ['style'] ... } @@ -386,7 +391,7 @@ Trivial Filter example: ... 'img': ['rel', 'src'] ... } ... 
- >>> TAGS = ['img'] + >>> TAGS = {'img'} >>> cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) >>> dirty = 'this is cute! ' >>> cleaner.clean(dirty) @@ -411,5 +416,3 @@ Using ``bleach.sanitizer.BleachSanitizerFilter`` use an html5lib filter. .. autoclass:: bleach.sanitizer.BleachSanitizerFilter - - diff --git a/docs/linkify.rst b/docs/linkify.rst index 13a1ed1c..a65f9e99 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -319,7 +319,7 @@ instance. >>> from bleach.linkifier import Linker - >>> linker = Linker(skip_tags=['pre']) + >>> linker = Linker(skip_tags={'pre'}) >>> linker.linkify('a b c http://example.com d e f') 'a b c http://example.com d e f' @@ -410,11 +410,11 @@ For example, using all the defaults: >>> from bleach import Cleaner >>> from bleach.linkifier import LinkifyFilter - >>> cleaner = Cleaner(tags=['pre']) + >>> cleaner = Cleaner(tags={'pre'}) >>> cleaner.clean('
http://example.com
') '
http://example.com
' - >>> cleaner = Cleaner(tags=['pre'], filters=[LinkifyFilter]) + >>> cleaner = Cleaner(tags={'pre'}, filters=[LinkifyFilter]) >>> cleaner.clean('
http://example.com
') '
http://example.com
' @@ -429,8 +429,8 @@ And passing parameters to ``LinkifyFilter``: >>> from bleach.linkifier import LinkifyFilter >>> cleaner = Cleaner( - ... tags=['pre'], - ... filters=[partial(LinkifyFilter, skip_tags=['pre'])] + ... tags={'pre'}, + ... filters=[partial(LinkifyFilter, skip_tags={'pre'})] ... ) ... >>> cleaner.clean('
http://example.com
') From 3085abc6746951224a594493f5b261932b36beae Mon Sep 17 00:00:00 2001 From: Will Kahn-Greene Date: Sat, 21 Jan 2023 18:09:54 -0500 Subject: [PATCH 2/5] Fix test data to pass sets instead of lists This fixes the test data to pass sets instead of lists for "tags", "skip_tags", "recognized_tags", and "protocols". --- bleach/linkifier.py | 3 +- tests/test_clean.py | 94 +++++++++++++++++++++---------------------- tests/test_css.py | 4 +- tests/test_linkify.py | 32 +++++++-------- 4 files changed, 63 insertions(+), 70 deletions(-) diff --git a/bleach/linkifier.py b/bleach/linkifier.py index 183eb5ba..b5a3041e 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -122,7 +122,8 @@ def __init__( :arg set skip_tags: set of tags that you don't want to linkify the contents of; for example, you could set this to ``{'pre'}`` to skip - linkifying contents of ``pre`` tags + linkifying contents of ``pre`` tags; ``None`` means you don't + want linkify to skip any tags :arg bool parse_email: whether or not to linkify email addresses diff --git a/tests/test_clean.py b/tests/test_clean.py index 10a91fd0..73946a1f 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -29,7 +29,7 @@ def test_clean_idempotent(data): def test_clean_idempotent_img(): - tags = ["img"] + tags = {"img"} dirty = '' assert clean(clean(dirty, tags=tags), tags=tags) == clean(dirty, tags=tags) @@ -254,21 +254,21 @@ def test_character_entities_handling(text, expected): # a tag is disallowed, so it's stripped ( '

link text

', - {"tags": ["p"]}, + {"tags": {"p"}}, "

link text

", ), # Test nested disallowed tag ( "

multiply nested text

", - {"tags": ["p"]}, + {"tags": {"p"}}, "

multiply nested text

", ), # (#271) - ("