Skip to content

Commit

Permalink
Replace bleach with nh3 (#984)
Browse files Browse the repository at this point in the history
* removed bleach, replaced w nh3

* added test

* modified tests

* move content back to original position

* remove repeated declaration

* remove django type casting

* add a few more tests

---------

Co-authored-by: Andy Shapiro <[email protected]>
  • Loading branch information
dannypeterson and shapiromatron authored Feb 8, 2024
1 parent cc45b0d commit 59b6c6b
Show file tree
Hide file tree
Showing 12 changed files with 277 additions and 63 deletions.
Empty file.
115 changes: 115 additions & 0 deletions hawc/apps/common/clean/sanitize_css.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Sanitize CSS."""

import tinycss2

# Default allow-list of CSS property names; used by ``CSSSanitizer`` when the
# caller does not supply its own list. Stored as a frozenset so the shared
# default cannot be mutated.
ALLOWED_CSS_PROPERTIES = frozenset(
    {
        # colors and borders
        "background-color",
        "border-bottom-color",
        "border-collapse",
        "border-color",
        "border-left-color",
        "border-right-color",
        "border-top-color",
        "color",
        # layout and box
        "clear",
        "cursor",
        "direction",
        "display",
        "float",
        "height",
        "overflow",
        "unicode-bidi",
        "vertical-align",
        "white-space",
        "width",
        # fonts and text
        "font",
        "font-family",
        "font-size",
        "font-style",
        "font-variant",
        "font-weight",
        "letter-spacing",
        "line-height",
        "text-align",
        "text-decoration",
        "text-indent",
        # aural properties
        "azimuth",
        "elevation",
        "pause",
        "pause-after",
        "pause-before",
        "pitch",
        "pitch-range",
        "richness",
        "speak",
        "speak-header",
        "speak-numeral",
        "speak-punctuation",
        "speech-rate",
        "stress",
        "voice-family",
        "volume",
    }
)


# Default allow-list of SVG presentation property names; used by
# ``CSSSanitizer`` when the caller does not supply its own list.
ALLOWED_SVG_PROPERTIES = frozenset(
    {
        # fill
        "fill",
        "fill-opacity",
        "fill-rule",
        # stroke
        "stroke",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-opacity",
        "stroke-width",
    }
)


class CSSSanitizer:
    """
    Sanitize inline CSS declarations against an allow-list of property names.

    Adapted from Bleach.
    https://github.com/mozilla/bleach/blob/main/bleach/css_sanitizer.py
    """

    def __init__(
        self,
        allowed_css_properties=ALLOWED_CSS_PROPERTIES,
        allowed_svg_properties=ALLOWED_SVG_PROPERTIES,
    ):
        """Store the allow-lists consulted by ``sanitize_css``.

        Args:
            allowed_css_properties: container of permitted CSS property names.
            allowed_svg_properties: container of permitted SVG property names.
        """
        self.allowed_svg_properties = allowed_svg_properties
        self.allowed_css_properties = allowed_css_properties

    def sanitize_css(self, style):
        """Return ``style`` with every non-allowed declaration removed.

        Args:
            style: text of a CSS declaration list (e.g. a ``style`` attribute).

        Returns:
            The serialized, stripped declarations that survived filtering, or
            an empty string when nothing was parsed or nothing survived.
        """
        nodes = tinycss2.parse_declaration_list(style)
        if not nodes:
            return ""

        kept = []
        for node in nodes:
            kind = node.type

            if kind == "declaration":
                # Property names are compared in lower case against both lists.
                name = node.lower_name
                if name in self.allowed_css_properties or name in self.allowed_svg_properties:
                    kept.append(node)
                continue

            # Anything that is neither a declaration, a comment, nor whitespace
            # (e.g. at-rules and parse errors) is silently discarded.
            if kind not in ("comment", "whitespace"):
                continue

            # Comments/whitespace are kept only after something has been kept,
            # and never twice in a row for the same node type.
            if kept and kept[-1].type != kind:
                kept.append(node)

        return tinycss2.serialize(kept).strip() if kept else ""
68 changes: 68 additions & 0 deletions hawc/apps/common/clean/sanitize_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""Validate html."""

import nh3

from .sanitize_css import CSSSanitizer

# HTML elements allowed to survive cleaning.
valid_html_tags: set[str] = {
    "a",
    "blockquote",
    "br",
    "div",
    "em",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "li",
    "mark",
    "ol",
    "p",
    "span",
    "strong",
    "sub",
    "sup",
    "s",
    "ul",
    "u",
}

# Attributes allowed per element; the "*" entry applies to every element.
valid_html_attrs: dict[str, set[str]] = {
    "*": {"style"},
    "a": {"class", "href"},
    "span": {"class"},
    "mark": {"class"},
    "div": {"class"},
}

# CSS properties allowed inside a ``style`` attribute.
valid_css_properties: set[str] = {"color", "background-color"}

# No SVG properties are allowed. ``{}`` would be an empty *dict*, so an empty
# set is spelled ``set()`` to keep this the same type as its sibling above.
valid_svg_properties: set[str] = set()

# Module-level sanitizer built once from this module's allow-lists and reused
# for every ``style`` attribute that is cleaned.
css_sanitizer = CSSSanitizer(
    allowed_svg_properties=valid_svg_properties,
    allowed_css_properties=valid_css_properties,
)


def clean_html(html: str) -> str:
    """Clean HTML by removing disallowed tags, attributes, and CSS properties.

    Note: inner text within invalid HTML tags will still be included.

    Args:
        html (str): HTML to clean

    Returns:
        str: cleaned HTML
    """

    def attribute_filter(element, attribute, value):
        """Route ``style`` values through the CSS sanitizer; keep others as-is."""
        if attribute != "style":
            return value
        return css_sanitizer.sanitize_css(value)

    cleaning_options = {
        "tags": valid_html_tags,
        "attributes": valid_html_attrs,
        "attribute_filter": attribute_filter,
    }
    return nh3.clean(html, **cleaning_options)
3 changes: 2 additions & 1 deletion hawc/apps/common/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from django.urls import reverse

from . import validators, widgets
from .clean import sanitize_html
from .helper import PydanticToDjangoError

ASSESSMENT_UNIQUE_MESSAGE = "Must be unique for assessment (current value already exists)."
Expand Down Expand Up @@ -383,7 +384,7 @@ def __init__(self, *args, **kwargs):

def to_python(self, value):
value = super().to_python(value)
return validators.clean_html(value) if value else value
return sanitize_html.clean_html(value) if value else value

def validate(self, value):
super().validate(value)
Expand Down
56 changes: 2 additions & 54 deletions hawc/apps/common/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,16 @@
from functools import partial
from urllib import parse

import bleach
from bleach.css_sanitizer import CSSSanitizer
from django.core.exceptions import ValidationError
from django.core.validators import RegexValidator, URLValidator
from django.utils.encoding import force_str
from pydantic import BaseModel
from pydantic import ValidationError as PydanticValidationError

from .clean.sanitize_html import valid_html_tags

tag_regex = re.compile(r"</?(?P<tag>\w+)[^>]*>")
hyperlink_regex = re.compile(r"href\s*=\s*['\"](.*?)['\"]")

valid_html_tags = {
"a",
"blockquote",
"br",
"div",
"em",
"h1",
"h2",
"h3",
"h4",
"h5",
"li",
"ol",
"p",
"span",
"strong",
"sub",
"sup",
"s",
"ul",
"u",
}
valid_html_attrs = {
"*": ["style"],
"a": ["href", "rel", "target"],
"span": ["class", "data-pk", "data-type"],
"div": ["class", "data-pk", "data-type"],
}
valid_css_properties = {"color", "background-color"}
valid_scheme = {"", "http", "https"}
valid_netloc_endings = {
"canada.ca",
Expand All @@ -59,28 +29,6 @@
}


def clean_html(html: str) -> str:
"""
Cleans given HTML by removing invalid HTML tags, HTML properties, and CSS properties.
Note: inner text within invalid HTML tags will still be included.
Args:
html (str): HTML to clean
Returns:
str: cleaned HTML
"""
css_sanitizer = CSSSanitizer(allowed_css_properties=valid_css_properties)
return bleach.clean(
html,
tags=valid_html_tags,
attributes=valid_html_attrs,
css_sanitizer=css_sanitizer,
strip=True,
)


def validate_html_tags(html: str, field: str | None = None) -> str:
"""Html contains a subset of acceptable tags.
Expand Down
5 changes: 3 additions & 2 deletions hawc/apps/riskofbias/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from rest_framework import serializers

from ..common import validators
from ..common.clean import sanitize_html
from ..common.helper import SerializerHelper, tryParseInt
from ..myuser.models import HAWCUser
from ..myuser.serializers import HAWCUserSerializer
Expand Down Expand Up @@ -103,7 +104,7 @@ class Meta:

def validate_notes(self, value):
validators.validate_hyperlinks(value)
return validators.clean_html(value)
return sanitize_html.clean_html(value)


class RiskOfBiasScoreSerializer(serializers.ModelSerializer):
Expand All @@ -129,7 +130,7 @@ class Meta:

def validate_notes(self, value):
validators.validate_hyperlinks(value)
return validators.clean_html(value)
return sanitize_html.clean_html(value)


class StudyScoreSerializer(RiskOfBiasScoreSerializer):
Expand Down
5 changes: 3 additions & 2 deletions hawc/apps/summary/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ..assessment.models import DoseUnits
from ..common import validators
from ..common.autocomplete import AutocompleteChoiceField
from ..common.clean import sanitize_html
from ..common.forms import (
BaseFormHelper,
CopyForm,
Expand Down Expand Up @@ -195,7 +196,7 @@ def clean_title(self):
def clean_caption(self):
caption = self.cleaned_data["caption"]
validators.validate_hyperlinks(caption)
return validators.clean_html(caption)
return sanitize_html.clean_html(caption)

def clean_evidence_type(self):
visual_type = self.cleaned_data["visual_type"]
Expand Down Expand Up @@ -679,7 +680,7 @@ def clean_title(self):
def clean_caption(self):
caption = self.cleaned_data["caption"]
validators.validate_hyperlinks(caption)
return validators.clean_html(caption)
return sanitize_html.clean_html(caption)


class DataPivotUploadForm(DataPivotForm):
Expand Down
3 changes: 2 additions & 1 deletion hawc/apps/summary/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from rest_framework import serializers

from ..common import validators
from ..common.clean import sanitize_html
from ..common.helper import SerializerHelper
from ..riskofbias.serializers import AssessmentRiskOfBiasSerializer
from . import constants, models
Expand Down Expand Up @@ -114,7 +115,7 @@ class Meta:

def validate_text(self, value):
validators.validate_hyperlinks(value)
return validators.clean_html(value)
return sanitize_html.clean_html(value)

def validate(self, data):
assessment = data["assessment"]
Expand Down
3 changes: 2 additions & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ redis==5.0.1
requests==2.31.0
urllib3<2
pydantic==2.4.2
bleach[css]==6.1.0
nh3==0.2.15
tinycss2==1.2.1

# computational
numpy==1.26.1
Expand Down
Empty file.
Loading

0 comments on commit 59b6c6b

Please sign in to comment.