From ce8ae68aba7ff040a4ab03b605b8cb21c1ca26a6 Mon Sep 17 00:00:00 2001 From: Denis Ledoux Date: Thu, 28 Mar 2024 13:06:15 +0100 Subject: [PATCH] FIX: Allow multiple inlined image data links in html clean Add a lazy quantifier in the regex `_find_image_dataurls` to match as few characters as possible, to make it stop at the first occurrence of `;base64,`, e.g. ```py >>> _find_image_dataurls = re.compile(r'data:image/(.+);base64,', re.I).findall >>> _find_image_dataurls('<div style="background: url(data:image/jpeg;base64,foo); background-image: url(data:image/jpeg;base64,foo)"></div>') ['jpeg;base64,foo); background-image: url(data:image/jpeg'] ``` ```py >>> _find_image_dataurls = re.compile(r'data:image/(.+?);base64,', re.I).findall >>> _find_image_dataurls('<div style="background: url(data:image/jpeg;base64,foo); background-image: url(data:image/jpeg;base64,foo)"></div>') ['jpeg', 'jpeg'] ``` This allows multiple image data links on the same line, which happens for instance in inline styles. Without this change, `_has_javascript_scheme` returns `True` because the count of safe image urls is lower than the number of possibly malicious schemes. The whole style is then dropped because it is considered malicious. Co-authored-by: Christophe Simonis --- lxml_html_clean/clean.py | 2 +- tests/test_clean.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index fdc96ab..a62b710 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -54,7 +54,7 @@ # All kinds of schemes besides just javascript: that can cause # execution: _find_image_dataurls = re.compile( - r'data:image/(.+);base64,', re.I).findall + r'data:image/(.+?);base64,', re.I).findall _possibly_malicious_schemes = re.compile( r'(javascript|jscript|livescript|vbscript|data|about|mocha):', re.I).findall diff --git a/tests/test_clean.py b/tests/test_clean.py index 2ec492d..9bd63eb 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -255,6 +255,31 @@ def test_image_data_links_in_style(self): cleaned, "%s -> %s" % (url, cleaned)) + def test_image_data_links_in_inline_style(self): + safe_attrs = set(lxml.html.defs.safe_attrs) + safe_attrs.add('style') + + cleaner = Cleaner( + safe_attrs_only=True, + safe_attrs=safe_attrs) + + data = b'123' + data_b64 = base64.b64encode(data).decode('ASCII') + url = "url(data:image/jpeg;base64,%s)" % data_b64 + styles = [ + "background: %s" % url, + "background: %s; background-image: %s" % (url, url), + ] + for style in styles: + html = '<div style="%s"></div>' % style + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(cleaner.clean_html(s)) + self.assertEqual( + html.encode("UTF-8"), + cleaned, + "%s -> %s" % (style, cleaned)) + def test_formaction_attribute_in_button_input(self): # The formaction attribute overrides the form's action and should be # treated as a malicious link attribute