From ce8ae68aba7ff040a4ab03b605b8cb21c1ca26a6 Mon Sep 17 00:00:00 2001
From: Denis Ledoux <dle@odoo.com>
Date: Thu, 28 Mar 2024 13:06:15 +0100
Subject: [PATCH] FIX: Allow multiple inlined image data links in html clean

Add a lazy quantifier in the regex `_find_image_dataurls`
to match as few characters as possible,
to make it stop at the first occurrence of `;base64,`.

e.g.
```py
>>> _find_image_dataurls = re.compile(r'data:image/(.+);base64,', re.I).findall
>>> _find_image_dataurls('<div style="background: url(data:image/jpeg;base64,foo); background-image: url(data:image/jpeg;base64,foo);"></div>')
['jpeg;base64,foo); background-image: url(data:image/jpeg']
```

```py
>>> _find_image_dataurls = re.compile(r'data:image/(.+?);base64,', re.I).findall
>>> _find_image_dataurls('<div style="background: url(data:image/jpeg;base64,foo); background-image: url(data:image/jpeg;base64,foo);"></div>')
['jpeg', 'jpeg']
```

This allows multiple image data links to appear on the same line,
which happens for instance in inline styles.

Without this change, `_has_javascript_scheme` returns `True`
because the count of safe image URLs is lower than the number of
possibly malicious schemes.
The whole style is then dropped because it is considered malicious.

Co-authored-by: Christophe Simonis <chs@odoo.com>
---
 lxml_html_clean/clean.py |  2 +-
 tests/test_clean.py      | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py
index fdc96ab..a62b710 100644
--- a/lxml_html_clean/clean.py
+++ b/lxml_html_clean/clean.py
@@ -54,7 +54,7 @@
 # All kinds of schemes besides just javascript: that can cause
 # execution:
 _find_image_dataurls = re.compile(
-    r'data:image/(.+);base64,', re.I).findall
+    r'data:image/(.+?);base64,', re.I).findall
 _possibly_malicious_schemes = re.compile(
     r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
     re.I).findall
diff --git a/tests/test_clean.py b/tests/test_clean.py
index 2ec492d..9bd63eb 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -255,6 +255,31 @@ def test_image_data_links_in_style(self):
                 cleaned,
                 "%s  ->  %s" % (url, cleaned))
 
+    def test_image_data_links_in_inline_style(self):
+        safe_attrs = set(lxml.html.defs.safe_attrs)
+        safe_attrs.add('style')
+
+        cleaner = Cleaner(
+            safe_attrs_only=True,
+            safe_attrs=safe_attrs)
+
+        data = b'123'
+        data_b64 = base64.b64encode(data).decode('ASCII')
+        url = "url(data:image/jpeg;base64,%s)" % data_b64
+        styles = [
+            "background: %s" % url,
+            "background: %s; background-image: %s" % (url, url),
+        ]
+        for style in styles:
+            html = '<div style="%s"></div>' % style
+            s = lxml.html.fragment_fromstring(html)
+
+            cleaned = lxml.html.tostring(cleaner.clean_html(s))
+            self.assertEqual(
+                html.encode("UTF-8"),
+                cleaned,
+                "%s  ->  %s" % (style, cleaned))
+
     def test_formaction_attribute_in_button_input(self):
         # The formaction attribute overrides the form's action and should be
         # treated as a malicious link attribute