From b8c933254bbaee626304a38d9d07c2f91247b532 Mon Sep 17 00:00:00 2001 From: Danny Peterson Date: Mon, 5 Feb 2024 14:28:21 -0500 Subject: [PATCH 1/7] removed bleach, replaced w nh3 --- hawc/apps/common/clean/__init__.py | 0 hawc/apps/common/clean/sanitize_css.py | 115 ++++++++++++++++++++++++ hawc/apps/common/clean/sanitize_html.py | 70 +++++++++++++++ hawc/apps/common/forms.py | 3 +- hawc/apps/common/validators.py | 62 ++++--------- hawc/apps/riskofbias/serializers.py | 5 +- hawc/apps/summary/forms.py | 5 +- hawc/apps/summary/serializers.py | 3 +- requirements/base.txt | 3 +- 9 files changed, 214 insertions(+), 52 deletions(-) create mode 100644 hawc/apps/common/clean/__init__.py create mode 100644 hawc/apps/common/clean/sanitize_css.py create mode 100644 hawc/apps/common/clean/sanitize_html.py diff --git a/hawc/apps/common/clean/__init__.py b/hawc/apps/common/clean/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hawc/apps/common/clean/sanitize_css.py b/hawc/apps/common/clean/sanitize_css.py new file mode 100644 index 0000000000..9410917c28 --- /dev/null +++ b/hawc/apps/common/clean/sanitize_css.py @@ -0,0 +1,115 @@ +"""Sanitize CSS.""" + +import tinycss2 + +ALLOWED_CSS_PROPERTIES = frozenset( + ( + "azimuth", + "background-color", + "border-bottom-color", + "border-collapse", + "border-color", + "border-left-color", + "border-right-color", + "border-top-color", + "clear", + "color", + "cursor", + "direction", + "display", + "elevation", + "float", + "font", + "font-family", + "font-size", + "font-style", + "font-variant", + "font-weight", + "height", + "letter-spacing", + "line-height", + "overflow", + "pause", + "pause-after", + "pause-before", + "pitch", + "pitch-range", + "richness", + "speak", + "speak-header", + "speak-numeral", + "speak-punctuation", + "speech-rate", + "stress", + "text-align", + "text-decoration", + "text-indent", + "unicode-bidi", + "vertical-align", + "voice-family", + "volume", + "white-space", + "width", + ) +) + + +ALLOWED_SVG_PROPERTIES = frozenset( + ( + "fill", + "fill-opacity", + "fill-rule", + "stroke", + "stroke-width", + "stroke-linecap", + "stroke-linejoin", + "stroke-opacity", + ) +) + + +class CSSSanitizer: + """ + Santitize CSS elements. + Adapted from Bleach. + https://github.com/mozilla/bleach/blob/main/bleach/css_sanitizer.py + """ + + def __init__( + self, + allowed_css_properties=ALLOWED_CSS_PROPERTIES, + allowed_svg_properties=ALLOWED_SVG_PROPERTIES, + ): + """Add allowed properties.""" + self.allowed_css_properties = allowed_css_properties + self.allowed_svg_properties = allowed_svg_properties + + def sanitize_css(self, style): + """Sanitizes css in style tags.""" + parsed = tinycss2.parse_declaration_list(style) + + if not parsed: + return "" + + new_tokens = [] + for token in parsed: + if token.type == "declaration": + if ( + token.lower_name in self.allowed_css_properties + or token.lower_name in self.allowed_svg_properties + ): + new_tokens.append(token) + elif ( + token.type in ("comment", "whitespace") + and new_tokens + and new_tokens[-1].type != token.type + ): + new_tokens.append(token) + + # NOTE(willkg): We currently don't handle AtRule or ParseError and + # so both get silently thrown out + + if not new_tokens: + return "" + + return tinycss2.serialize(new_tokens).strip() diff --git a/hawc/apps/common/clean/sanitize_html.py b/hawc/apps/common/clean/sanitize_html.py new file mode 100644 index 0000000000..f39707fcc4 --- /dev/null +++ b/hawc/apps/common/clean/sanitize_html.py @@ -0,0 +1,70 @@ +"""Validate html.""" + +import nh3 +from django.utils.safestring import SafeText, mark_safe + +from .sanitize_css import CSSSanitizer + +valid_html_tags = { + "a", + "blockquote", + "br", + "div", + "em", + "h1", + "h2", + "h3", + "h4", + "h5", + "li", + "mark", + "ol", + "p", + "span", + "strong", + "sub", + "sup", + "s", + "ul", + "u", +} + +valid_html_attrs = { + "*": {"style"}, + "a": {"class", "href"}, + "span": {"class"}, + "mark": {"class"}, + "div": {"class"}, +} + +valid_css_properties = {"color", "background-color"} +valid_svg_properties = {} + +css_sanitizer = CSSSanitizer( + allowed_css_properties=valid_css_properties, + allowed_svg_properties=valid_svg_properties, +) + + +def clean_html(html: str) -> SafeText: + """Cleans given HTML by removing invalid HTML tags, attributes, and CSS properties. + Note: inner text within invalid HTML tags will still be included. + Args: + html (str): HTML to clean + Returns: + str: cleaned HTML + """ + + def attribute_filter(element, attribute, value): + """Send styles to CSS sanitizer.""" + if attribute == "style": + return css_sanitizer.sanitize_css(value) + return value + + sanitize_html = nh3.clean( + html, + tags=valid_html_tags, + attributes=valid_html_attrs, + attribute_filter=attribute_filter, + ) + return mark_safe(sanitize_html) diff --git a/hawc/apps/common/forms.py b/hawc/apps/common/forms.py index 638e536f55..3485fc522e 100644 --- a/hawc/apps/common/forms.py +++ b/hawc/apps/common/forms.py @@ -8,6 +8,7 @@ from django.urls import reverse from . import validators, widgets +from .clean import sanitize_html from .helper import PydanticToDjangoError ASSESSMENT_UNIQUE_MESSAGE = "Must be unique for assessment (current value already exists)." @@ -383,7 +384,7 @@ def __init__(self, *args, **kwargs): def to_python(self, value): value = super().to_python(value) - return validators.clean_html(value) if value else value + return sanitize_html.clean_html(value) if value else value def validate(self, value): super().validate(value) diff --git a/hawc/apps/common/validators.py b/hawc/apps/common/validators.py index 32734084f0..c48edcb904 100644 --- a/hawc/apps/common/validators.py +++ b/hawc/apps/common/validators.py @@ -3,8 +3,6 @@ from functools import partial from urllib import parse -import bleach -from bleach.css_sanitizer import CSSSanitizer from django.core.exceptions import ValidationError from django.core.validators import RegexValidator, URLValidator from django.utils.encoding import force_str @@ -14,6 +12,22 @@ tag_regex = re.compile(r"\w+)[^>]*>") hyperlink_regex = re.compile(r"href\s*=\s*['\"](.*?)['\"]") + +valid_scheme = {"", "http", "https"} +valid_netloc_endings = { + "canada.ca", + ".edu", + ".gov", + ".who.int", + "doi.org", + "elsevier.com", + "public.tableau.com", + "sciencedirect.com", + "sharepoint.com", + "hawcproject.org", + "zenodo.org", +} + valid_html_tags = { "a", "blockquote", @@ -26,6 +40,7 @@ "h4", "h5", "li", + "mark", "ol", "p", "span", @@ -36,49 +51,6 @@ "ul", "u", } -valid_html_attrs = { - "*": ["style"], - "a": ["href", "rel", "target"], - "span": ["class", "data-pk", "data-type"], - "div": ["class", "data-pk", "data-type"], -} -valid_css_properties = {"color", "background-color"} -valid_scheme = {"", "http", "https"} -valid_netloc_endings = { - "canada.ca", - ".edu", - ".gov", - ".who.int", - "doi.org", - "elsevier.com", - "public.tableau.com", - "sciencedirect.com", - "sharepoint.com", - "hawcproject.org", - "zenodo.org", -} - - -def clean_html(html: str) -> str: - """ - Cleans given HTML by removing invalid HTML tags, HTML properties, and CSS properties. - - Note: inner text within invalid HTML tags will still be included. - - Args: - html (str): HTML to clean - - Returns: - str: cleaned HTML - """ - css_sanitizer = CSSSanitizer(allowed_css_properties=valid_css_properties) - return bleach.clean( - html, - tags=valid_html_tags, - attributes=valid_html_attrs, - css_sanitizer=css_sanitizer, - strip=True, - ) def validate_html_tags(html: str, field: str | None = None) -> str: diff --git a/hawc/apps/riskofbias/serializers.py b/hawc/apps/riskofbias/serializers.py index 4499f19e55..b06d623c43 100644 --- a/hawc/apps/riskofbias/serializers.py +++ b/hawc/apps/riskofbias/serializers.py @@ -4,6 +4,7 @@ from rest_framework import serializers from ..common import validators +from ..common.clean import sanitize_html from ..common.helper import SerializerHelper, tryParseInt from ..myuser.models import HAWCUser from ..myuser.serializers import HAWCUserSerializer @@ -103,7 +104,7 @@ class Meta: def validate_notes(self, value): validators.validate_hyperlinks(value) - return validators.clean_html(value) + return sanitize_html.clean_html(value) class RiskOfBiasScoreSerializer(serializers.ModelSerializer): @@ -129,7 +130,7 @@ class Meta: def validate_notes(self, value): validators.validate_hyperlinks(value) - return validators.clean_html(value) + return sanitize_html.clean_html(value) class StudyScoreSerializer(RiskOfBiasScoreSerializer): diff --git a/hawc/apps/summary/forms.py b/hawc/apps/summary/forms.py index 20f2ca0a4e..ef8dcc3ce2 100644 --- a/hawc/apps/summary/forms.py +++ b/hawc/apps/summary/forms.py @@ -16,6 +16,7 @@ from ..assessment.models import DoseUnits from ..common import validators from ..common.autocomplete import AutocompleteChoiceField +from ..common.clean import sanitize_html from ..common.forms import ( BaseFormHelper, CopyForm, @@ -195,7 +196,7 @@ def clean_title(self): def clean_caption(self): caption = self.cleaned_data["caption"] validators.validate_hyperlinks(caption) - return validators.clean_html(caption) + return sanitize_html.clean_html(caption) def clean_evidence_type(self): visual_type = self.cleaned_data["visual_type"] @@ -679,7 +680,7 @@ def clean_title(self): def clean_caption(self): caption = self.cleaned_data["caption"] validators.validate_hyperlinks(caption) - return validators.clean_html(caption) + return sanitize_html.clean_html(caption) class DataPivotUploadForm(DataPivotForm): diff --git a/hawc/apps/summary/serializers.py b/hawc/apps/summary/serializers.py index 25de07f5c4..aad862f839 100644 --- a/hawc/apps/summary/serializers.py +++ b/hawc/apps/summary/serializers.py @@ -3,6 +3,7 @@ from rest_framework import serializers from ..common import validators +from ..common.clean import sanitize_html from ..common.helper import SerializerHelper from ..riskofbias.serializers import AssessmentRiskOfBiasSerializer from . import constants, models @@ -114,7 +115,7 @@ class Meta: def validate_text(self, value): validators.validate_hyperlinks(value) - return validators.clean_html(value) + return sanitize_html.clean_html(value) def validate(self, data): assessment = data["assessment"] diff --git a/requirements/base.txt b/requirements/base.txt index 989d83fbae..0542fcdee5 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -29,7 +29,8 @@ redis==5.0.1 requests==2.31.0 urllib3<2 pydantic==2.4.2 -bleach[css]==6.1.0 +nh3==0.2.15 +tinycss2==1.2.1 # computational numpy==1.26.1 From a5707cfca17ea6b33e78a8ae26a81e517b058d2e Mon Sep 17 00:00:00 2001 From: Danny Peterson Date: Mon, 5 Feb 2024 14:35:27 -0500 Subject: [PATCH 2/7] added test --- tests/hawc/apps/common/clean/__init__.py | 0 .../apps/common/clean/test_sanitize_html.py | 69 +++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 tests/hawc/apps/common/clean/__init__.py create mode 100644 tests/hawc/apps/common/clean/test_sanitize_html.py diff --git a/tests/hawc/apps/common/clean/__init__.py b/tests/hawc/apps/common/clean/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/hawc/apps/common/clean/test_sanitize_html.py b/tests/hawc/apps/common/clean/test_sanitize_html.py new file mode 100644 index 0000000000..84f0b9339a --- /dev/null +++ b/tests/hawc/apps/common/clean/test_sanitize_html.py @@ -0,0 +1,69 @@ +"""Test HTML cleanup.""" +from hawc.apps.common.clean.sanitize_html import clean_html + + +class TestEscapeAbstract: + def test_expected_html(self): + # no changes, no elements + text = clean_html("text without any elements") + assert text == "text without any elements" + + # no changes, expected elements + # + text = clean_html('turtles') + assert text == 'turtles' + + # + text = clean_html('BACKGROUND: ') + assert text == 'BACKGROUND: ' + + #
+ text = clean_html("health-care system.
") + assert text == "health-care system.
" + + def test_unexpected_html(self): + # elements will be broken + text = clean_html("before after ") + assert text == "before after " + + # tags that aren't real will be escaped + text = clean_html("before <10>after") + assert text == "before <10>after" + + def test_css(self): + text = clean_html( + 'This is a test!' + ) + assert text == 'This is a test!' + + def test_all(self): + text = clean_html( + """ + <10 + < 20 + + +

+ + + """ + ) + assert ( + text + == """ + <10 + < 20 + + +

+ + + """ + ) From e26fd9e6dbc8c0dfce22616f8b75739bc743157f Mon Sep 17 00:00:00 2001 From: Danny Peterson Date: Tue, 6 Feb 2024 09:09:15 -0500 Subject: [PATCH 3/7] modified tests --- tests/hawc/apps/common/test_forms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/hawc/apps/common/test_forms.py b/tests/hawc/apps/common/test_forms.py index 909563fef2..668bf4aa3d 100644 --- a/tests/hawc/apps/common/test_forms.py +++ b/tests/hawc/apps/common/test_forms.py @@ -24,11 +24,11 @@ def test_to_python(self): fld = QuillField() data = [ # remove script tag - ("", "alert();"), + ("", ""), # remove attribute ( 'link', - 'link', + 'link', ), # remove some styles ( From d994eefb63739ffca708da163e7f97baf37fc23b Mon Sep 17 00:00:00 2001 From: Andy Shapiro Date: Thu, 8 Feb 2024 10:22:06 -0500 Subject: [PATCH 4/7] move content back to original position --- hawc/apps/common/validators.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/hawc/apps/common/validators.py b/hawc/apps/common/validators.py index c48edcb904..db1118bae5 100644 --- a/hawc/apps/common/validators.py +++ b/hawc/apps/common/validators.py @@ -13,21 +13,6 @@ hyperlink_regex = re.compile(r"href\s*=\s*['\"](.*?)['\"]") -valid_scheme = {"", "http", "https"} -valid_netloc_endings = { - "canada.ca", - ".edu", - ".gov", - ".who.int", - "doi.org", - "elsevier.com", - "public.tableau.com", - "sciencedirect.com", - "sharepoint.com", - "hawcproject.org", - "zenodo.org", -} - valid_html_tags = { "a", "blockquote", @@ -51,6 +36,20 @@ "ul", "u", } +valid_scheme = {"", "http", "https"} +valid_netloc_endings = { + "canada.ca", + ".edu", + ".gov", + ".who.int", + "doi.org", + "elsevier.com", + "public.tableau.com", + "sciencedirect.com", + "sharepoint.com", + "hawcproject.org", + "zenodo.org", +} def validate_html_tags(html: str, field: str | None = None) -> str: From b42dc73a003bad66f8d22502e6f21db8427f9a2b Mon Sep 17 00:00:00 2001 From: Andy Shapiro Date: Thu, 8 Feb 2024 11:03:24 -0500 Subject: [PATCH 5/7] remove repeated declaration --- hawc/apps/common/validators.py | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/hawc/apps/common/validators.py b/hawc/apps/common/validators.py index db1118bae5..6075a6dcaf 100644 --- a/hawc/apps/common/validators.py +++ b/hawc/apps/common/validators.py @@ -9,33 +9,10 @@ from pydantic import BaseModel from pydantic import ValidationError as PydanticValidationError +from .clean.sanitize_html import valid_html_tags + tag_regex = re.compile(r"\w+)[^>]*>") hyperlink_regex = re.compile(r"href\s*=\s*['\"](.*?)['\"]") - - -valid_html_tags = { - "a", - "blockquote", - "br", - "div", - "em", - "h1", - "h2", - "h3", - "h4", - "h5", - "li", - "mark", - "ol", - "p", - "span", - "strong", - "sub", - "sup", - "s", - "ul", - "u", -} valid_scheme = {"", "http", "https"} valid_netloc_endings = { "canada.ca", From 7ac2874df3ef9910f6e848058043505d36aa3ada Mon Sep 17 00:00:00 2001 From: Andy Shapiro Date: Thu, 8 Feb 2024 11:08:29 -0500 Subject: [PATCH 6/7] remove django type casting --- hawc/apps/common/clean/sanitize_html.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/hawc/apps/common/clean/sanitize_html.py b/hawc/apps/common/clean/sanitize_html.py index f39707fcc4..e62f7efabf 100644 --- a/hawc/apps/common/clean/sanitize_html.py +++ b/hawc/apps/common/clean/sanitize_html.py @@ -1,7 +1,6 @@ """Validate html.""" import nh3 -from django.utils.safestring import SafeText, mark_safe from .sanitize_css import CSSSanitizer @@ -46,7 +45,7 @@ ) -def clean_html(html: str) -> SafeText: +def clean_html(html: str) -> str: """Cleans given HTML by removing invalid HTML tags, attributes, and CSS properties. Note: inner text within invalid HTML tags will still be included. Args: @@ -61,10 +60,9 @@ def attribute_filter(element, attribute, value): return css_sanitizer.sanitize_css(value) return value - sanitize_html = nh3.clean( + return nh3.clean( html, tags=valid_html_tags, attributes=valid_html_attrs, attribute_filter=attribute_filter, ) - return mark_safe(sanitize_html) From 00a8d8bdb013362d5c6b362844857ef9d1297c4e Mon Sep 17 00:00:00 2001 From: Andy Shapiro Date: Thu, 8 Feb 2024 11:16:17 -0500 Subject: [PATCH 7/7] add a few more tests --- .../apps/common/clean/test_sanitize_html.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/hawc/apps/common/clean/test_sanitize_html.py b/tests/hawc/apps/common/clean/test_sanitize_html.py index 84f0b9339a..4d75bb5375 100644 --- a/tests/hawc/apps/common/clean/test_sanitize_html.py +++ b/tests/hawc/apps/common/clean/test_sanitize_html.py @@ -10,17 +10,26 @@ def test_expected_html(self): # no changes, expected elements # - text = clean_html('turtles') - assert text == 'turtles' + text = clean_html('turtles') + assert text == 'turtles' # - text = clean_html('BACKGROUND: ') - assert text == 'BACKGROUND: ' + text = clean_html('BACKGROUND: ') + assert text == 'BACKGROUND: ' #
text = clean_html("health-care system.
") assert text == "health-care system.
" + # check < still works if spaced correctly + for a, b in ( + ("10 < 20", "10 < 20"), + ("10 <= 20", "10 <= 20"), + ("30 > 20", "30 > 20"), + ("30 >= 20", "30 >= 20"), + ): + assert clean_html(a) == b + def test_unexpected_html(self): #