diff --git a/hawc/apps/common/clean/__init__.py b/hawc/apps/common/clean/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hawc/apps/common/clean/sanitize_css.py b/hawc/apps/common/clean/sanitize_css.py new file mode 100644 index 0000000000..9410917c28 --- /dev/null +++ b/hawc/apps/common/clean/sanitize_css.py @@ -0,0 +1,115 @@ +"""Sanitize CSS.""" + +import tinycss2 + +ALLOWED_CSS_PROPERTIES = frozenset( + ( + "azimuth", + "background-color", + "border-bottom-color", + "border-collapse", + "border-color", + "border-left-color", + "border-right-color", + "border-top-color", + "clear", + "color", + "cursor", + "direction", + "display", + "elevation", + "float", + "font", + "font-family", + "font-size", + "font-style", + "font-variant", + "font-weight", + "height", + "letter-spacing", + "line-height", + "overflow", + "pause", + "pause-after", + "pause-before", + "pitch", + "pitch-range", + "richness", + "speak", + "speak-header", + "speak-numeral", + "speak-punctuation", + "speech-rate", + "stress", + "text-align", + "text-decoration", + "text-indent", + "unicode-bidi", + "vertical-align", + "voice-family", + "volume", + "white-space", + "width", + ) +) + + +ALLOWED_SVG_PROPERTIES = frozenset( + ( + "fill", + "fill-opacity", + "fill-rule", + "stroke", + "stroke-width", + "stroke-linecap", + "stroke-linejoin", + "stroke-opacity", + ) +) + + +class CSSSanitizer: + """ + Santitize CSS elements. + Adapted from Bleach. + https://github.com/mozilla/bleach/blob/main/bleach/css_sanitizer.py + """ + + def __init__( + self, + allowed_css_properties=ALLOWED_CSS_PROPERTIES, + allowed_svg_properties=ALLOWED_SVG_PROPERTIES, + ): + """Add allowed properties.""" + self.allowed_css_properties = allowed_css_properties + self.allowed_svg_properties = allowed_svg_properties + + def sanitize_css(self, style): + """Sanitizes css in style tags.""" + parsed = tinycss2.parse_declaration_list(style) + + if not parsed: + return "" + + new_tokens = [] + for token in parsed: + if token.type == "declaration": + if ( + token.lower_name in self.allowed_css_properties + or token.lower_name in self.allowed_svg_properties + ): + new_tokens.append(token) + elif ( + token.type in ("comment", "whitespace") + and new_tokens + and new_tokens[-1].type != token.type + ): + new_tokens.append(token) + + # NOTE(willkg): We currently don't handle AtRule or ParseError and + # so both get silently thrown out + + if not new_tokens: + return "" + + return tinycss2.serialize(new_tokens).strip() diff --git a/hawc/apps/common/clean/sanitize_html.py b/hawc/apps/common/clean/sanitize_html.py new file mode 100644 index 0000000000..e62f7efabf --- /dev/null +++ b/hawc/apps/common/clean/sanitize_html.py @@ -0,0 +1,68 @@ +"""Validate html.""" + +import nh3 + +from .sanitize_css import CSSSanitizer + +valid_html_tags = { + "a", + "blockquote", + "br", + "div", + "em", + "h1", + "h2", + "h3", + "h4", + "h5", + "li", + "mark", + "ol", + "p", + "span", + "strong", + "sub", + "sup", + "s", + "ul", + "u", +} + +valid_html_attrs = { + "*": {"style"}, + "a": {"class", "href"}, + "span": {"class"}, + "mark": {"class"}, + "div": {"class"}, +} + +valid_css_properties = {"color", "background-color"} +valid_svg_properties = {} + +css_sanitizer = CSSSanitizer( + allowed_css_properties=valid_css_properties, + allowed_svg_properties=valid_svg_properties, +) + + +def clean_html(html: str) -> str: + """Cleans given HTML by removing invalid HTML tags, attributes, and CSS properties. + Note: inner text within invalid HTML tags will still be included. + Args: + html (str): HTML to clean + Returns: + str: cleaned HTML + """ + + def attribute_filter(element, attribute, value): + """Send styles to CSS sanitizer.""" + if attribute == "style": + return css_sanitizer.sanitize_css(value) + return value + + return nh3.clean( + html, + tags=valid_html_tags, + attributes=valid_html_attrs, + attribute_filter=attribute_filter, + ) diff --git a/hawc/apps/common/forms.py b/hawc/apps/common/forms.py index 638e536f55..3485fc522e 100644 --- a/hawc/apps/common/forms.py +++ b/hawc/apps/common/forms.py @@ -8,6 +8,7 @@ from django.urls import reverse from . import validators, widgets +from .clean import sanitize_html from .helper import PydanticToDjangoError ASSESSMENT_UNIQUE_MESSAGE = "Must be unique for assessment (current value already exists)." @@ -383,7 +384,7 @@ def __init__(self, *args, **kwargs): def to_python(self, value): value = super().to_python(value) - return validators.clean_html(value) if value else value + return sanitize_html.clean_html(value) if value else value def validate(self, value): super().validate(value) diff --git a/hawc/apps/common/validators.py b/hawc/apps/common/validators.py index 32734084f0..6075a6dcaf 100644 --- a/hawc/apps/common/validators.py +++ b/hawc/apps/common/validators.py @@ -3,46 +3,16 @@ from functools import partial from urllib import parse -import bleach -from bleach.css_sanitizer import CSSSanitizer from django.core.exceptions import ValidationError from django.core.validators import RegexValidator, URLValidator from django.utils.encoding import force_str from pydantic import BaseModel from pydantic import ValidationError as PydanticValidationError +from .clean.sanitize_html import valid_html_tags + tag_regex = re.compile(r"\w+)[^>]*>") hyperlink_regex = re.compile(r"href\s*=\s*['\"](.*?)['\"]") - -valid_html_tags = { - "a", - "blockquote", - "br", - "div", - "em", - "h1", - "h2", - "h3", - "h4", - "h5", - "li", - "ol", - "p", - "span", - "strong", - "sub", - "sup", - "s", - "ul", - "u", -} -valid_html_attrs = { - "*": ["style"], - "a": ["href", "rel", "target"], - "span": ["class", "data-pk", "data-type"], - "div": ["class", "data-pk", "data-type"], -} -valid_css_properties = {"color", "background-color"} valid_scheme = {"", "http", "https"} valid_netloc_endings = { "canada.ca", @@ -59,28 +29,6 @@ } -def clean_html(html: str) -> str: - """ - Cleans given HTML by removing invalid HTML tags, HTML properties, and CSS properties. - - Note: inner text within invalid HTML tags will still be included. - - Args: - html (str): HTML to clean - - Returns: - str: cleaned HTML - """ - css_sanitizer = CSSSanitizer(allowed_css_properties=valid_css_properties) - return bleach.clean( - html, - tags=valid_html_tags, - attributes=valid_html_attrs, - css_sanitizer=css_sanitizer, - strip=True, - ) - - def validate_html_tags(html: str, field: str | None = None) -> str: """Html contains a subset of acceptable tags. diff --git a/hawc/apps/riskofbias/serializers.py b/hawc/apps/riskofbias/serializers.py index 4499f19e55..b06d623c43 100644 --- a/hawc/apps/riskofbias/serializers.py +++ b/hawc/apps/riskofbias/serializers.py @@ -4,6 +4,7 @@ from rest_framework import serializers from ..common import validators +from ..common.clean import sanitize_html from ..common.helper import SerializerHelper, tryParseInt from ..myuser.models import HAWCUser from ..myuser.serializers import HAWCUserSerializer @@ -103,7 +104,7 @@ class Meta: def validate_notes(self, value): validators.validate_hyperlinks(value) - return validators.clean_html(value) + return sanitize_html.clean_html(value) class RiskOfBiasScoreSerializer(serializers.ModelSerializer): @@ -129,7 +130,7 @@ class Meta: def validate_notes(self, value): validators.validate_hyperlinks(value) - return validators.clean_html(value) + return sanitize_html.clean_html(value) class StudyScoreSerializer(RiskOfBiasScoreSerializer): diff --git a/hawc/apps/summary/forms.py b/hawc/apps/summary/forms.py index 20f2ca0a4e..ef8dcc3ce2 100644 --- a/hawc/apps/summary/forms.py +++ b/hawc/apps/summary/forms.py @@ -16,6 +16,7 @@ from ..assessment.models import DoseUnits from ..common import validators from ..common.autocomplete import AutocompleteChoiceField +from ..common.clean import sanitize_html from ..common.forms import ( BaseFormHelper, CopyForm, @@ -195,7 +196,7 @@ def clean_title(self): def clean_caption(self): caption = self.cleaned_data["caption"] validators.validate_hyperlinks(caption) - return validators.clean_html(caption) + return sanitize_html.clean_html(caption) def clean_evidence_type(self): visual_type = self.cleaned_data["visual_type"] @@ -679,7 +680,7 @@ def clean_title(self): def clean_caption(self): caption = self.cleaned_data["caption"] validators.validate_hyperlinks(caption) - return validators.clean_html(caption) + return sanitize_html.clean_html(caption) class DataPivotUploadForm(DataPivotForm): diff --git a/hawc/apps/summary/serializers.py b/hawc/apps/summary/serializers.py index 25de07f5c4..aad862f839 100644 --- a/hawc/apps/summary/serializers.py +++ b/hawc/apps/summary/serializers.py @@ -3,6 +3,7 @@ from rest_framework import serializers from ..common import validators +from ..common.clean import sanitize_html from ..common.helper import SerializerHelper from ..riskofbias.serializers import AssessmentRiskOfBiasSerializer from . import constants, models @@ -114,7 +115,7 @@ class Meta: def validate_text(self, value): validators.validate_hyperlinks(value) - return validators.clean_html(value) + return sanitize_html.clean_html(value) def validate(self, data): assessment = data["assessment"] diff --git a/requirements/base.txt b/requirements/base.txt index e36762984c..96c7208f67 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -29,7 +29,8 @@ redis==5.0.1 requests==2.31.0 urllib3<2 pydantic==2.4.2 -bleach[css]==6.1.0 +nh3==0.2.15 +tinycss2==1.2.1 # computational numpy==1.26.1 diff --git a/tests/hawc/apps/common/clean/__init__.py b/tests/hawc/apps/common/clean/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/hawc/apps/common/clean/test_sanitize_html.py b/tests/hawc/apps/common/clean/test_sanitize_html.py new file mode 100644 index 0000000000..4d75bb5375 --- /dev/null +++ b/tests/hawc/apps/common/clean/test_sanitize_html.py @@ -0,0 +1,78 @@ +"""Test HTML cleanup.""" +from hawc.apps.common.clean.sanitize_html import clean_html + + +class TestEscapeAbstract: + def test_expected_html(self): + # no changes, no elements + text = clean_html("text without any elements") + assert text == "text without any elements" + + # no changes, expected elements + # + text = clean_html('turtles') + assert text == 'turtles' + + # + text = clean_html('BACKGROUND: ') + assert text == 'BACKGROUND: ' + + #
+ text = clean_html("health-care system.
") + assert text == "health-care system.
" + + # check < still works if spaced correctly + for a, b in ( + ("10 < 20", "10 < 20"), + ("10 <= 20", "10 <= 20"), + ("30 > 20", "30 > 20"), + ("30 >= 20", "30 >= 20"), + ): + assert clean_html(a) == b + + def test_unexpected_html(self): + # elements will be broken + text = clean_html("before after ") + assert text == "before after " + + # tags that aren't real will be escaped + text = clean_html("before <10>after") + assert text == "before <10>after" + + def test_css(self): + text = clean_html( + 'This is a test!' + ) + assert text == 'This is a test!' + + def test_all(self): + text = clean_html( + """ + <10 + < 20 + + +

+ + + """ + ) + assert ( + text + == """ + <10 + < 20 + + +

+ + + """ + ) diff --git a/tests/hawc/apps/common/test_forms.py b/tests/hawc/apps/common/test_forms.py index 909563fef2..668bf4aa3d 100644 --- a/tests/hawc/apps/common/test_forms.py +++ b/tests/hawc/apps/common/test_forms.py @@ -24,11 +24,11 @@ def test_to_python(self): fld = QuillField() data = [ # remove script tag - ("", "alert();"), + ("", ""), # remove attribute ( 'link', - 'link', + 'link', ), # remove some styles (