Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace bleach with nh3 #984

Merged
merged 9 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
115 changes: 115 additions & 0 deletions hawc/apps/common/clean/sanitize_css.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Sanitize CSS."""

import tinycss2

# Default allow-list of CSS property names that survive sanitization.
# Membership is the only thing that matters, so entries are grouped by purpose
# rather than listed alphabetically.
ALLOWED_CSS_PROPERTIES = frozenset(
    {
        # colors and borders
        "background-color",
        "border-bottom-color",
        "border-collapse",
        "border-color",
        "border-left-color",
        "border-right-color",
        "border-top-color",
        "color",
        # layout and interaction
        "clear",
        "cursor",
        "display",
        "float",
        "height",
        "overflow",
        "vertical-align",
        "width",
        # fonts
        "font",
        "font-family",
        "font-size",
        "font-style",
        "font-variant",
        "font-weight",
        # text flow
        "direction",
        "letter-spacing",
        "line-height",
        "text-align",
        "text-decoration",
        "text-indent",
        "unicode-bidi",
        "white-space",
        # aural / speech
        "azimuth",
        "elevation",
        "pause",
        "pause-after",
        "pause-before",
        "pitch",
        "pitch-range",
        "richness",
        "speak",
        "speak-header",
        "speak-numeral",
        "speak-punctuation",
        "speech-rate",
        "stress",
        "voice-family",
        "volume",
    }
)


# Default allow-list of SVG presentation property names that survive
# sanitization, in addition to the CSS allow-list.
ALLOWED_SVG_PROPERTIES = frozenset(
    {
        # fill
        "fill",
        "fill-opacity",
        "fill-rule",
        # stroke
        "stroke",
        "stroke-linecap",
        "stroke-linejoin",
        "stroke-opacity",
        "stroke-width",
    }
)


class CSSSanitizer:
    """
    Sanitize the declarations found in an inline CSS ``style`` value.

    Adapted from Bleach.
    https://github.com/mozilla/bleach/blob/main/bleach/css_sanitizer.py
    """

    def __init__(
        self,
        allowed_css_properties=ALLOWED_CSS_PROPERTIES,
        allowed_svg_properties=ALLOWED_SVG_PROPERTIES,
    ):
        """Remember which CSS and SVG property names are allowed.

        Both arguments only need to support ``in`` membership tests.
        """
        self.allowed_css_properties = allowed_css_properties
        self.allowed_svg_properties = allowed_svg_properties

    def _is_allowed(self, property_name):
        """Return True when the lower-cased property name is on either allow-list."""
        return (
            property_name in self.allowed_css_properties
            or property_name in self.allowed_svg_properties
        )

    def sanitize_css(self, style):
        """Return ``style`` with every non-allowed declaration removed.

        The value is parsed as a CSS declaration list. Declarations whose
        property name is allowed are kept. A comment or whitespace node is
        kept only when something has already been kept and the previously
        kept node is of a different type. An empty string is returned when
        nothing survives.
        """
        kept = []
        for node in tinycss2.parse_declaration_list(style):
            kind = node.type
            if kind == "declaration":
                keep = self._is_allowed(node.lower_name)
            elif kind in ("comment", "whitespace"):
                # Never lead with filler, and never keep two filler nodes of
                # the same type back to back.
                keep = bool(kept) and kept[-1].type != kind
            else:
                # NOTE: at-rules and parse errors are not handled (inherited
                # from Bleach), so they are silently discarded.
                keep = False

            if keep:
                kept.append(node)

        if not kept:
            return ""

        return tinycss2.serialize(kept).strip()
68 changes: 68 additions & 0 deletions hawc/apps/common/clean/sanitize_html.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
"""Validate html."""

import nh3

from .sanitize_css import CSSSanitizer

# HTML tag names passed as the ``tags`` allow-list when cleaning HTML.
valid_html_tags = {
    # block containers and headings
    "blockquote",
    "div",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "p",
    # lists
    "li",
    "ol",
    "ul",
    # inline content and formatting
    "a",
    "br",
    "em",
    "mark",
    "s",
    "span",
    "strong",
    "sub",
    "sup",
    "u",
}

# Attribute allow-list keyed by tag name; the "*" entry is the wildcard key.
valid_html_attrs = {
    "*": {"style"},
    "a": {"class", "href"},
    # These tags accept nothing beyond ``class``; each gets its own set object.
    **{tag: {"class"} for tag in ("span", "mark", "div")},
}

# Inline styles may only set the text color and the background color.
valid_css_properties = {"color", "background-color"}
# No SVG presentation properties are allowed. ``set()`` is used because a bare
# ``{}`` is an empty dict, not an empty set.
valid_svg_properties = set()

# Module-level sanitizer built once at import time from the two allow-lists above.
css_sanitizer = CSSSanitizer(
    allowed_css_properties=valid_css_properties,
    allowed_svg_properties=valid_svg_properties,
)


def clean_html(html: str) -> str:
    """Clean HTML by removing invalid HTML tags, attributes, and CSS properties.

    Note: inner text within invalid HTML tags will still be included.

    Args:
        html (str): HTML to clean

    Returns:
        str: cleaned HTML
    """

    def sanitize_attribute(element, attribute, value):
        """Rewrite ``style`` values through the CSS sanitizer; keep others as-is."""
        return value if attribute != "style" else css_sanitizer.sanitize_css(value)

    cleaner_options = {
        "tags": valid_html_tags,
        "attributes": valid_html_attrs,
        "attribute_filter": sanitize_attribute,
    }
    return nh3.clean(html, **cleaner_options)
3 changes: 2 additions & 1 deletion hawc/apps/common/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from django.urls import reverse

from . import validators, widgets
from .clean import sanitize_html
from .helper import PydanticToDjangoError

ASSESSMENT_UNIQUE_MESSAGE = "Must be unique for assessment (current value already exists)."
Expand Down Expand Up @@ -383,7 +384,7 @@ def __init__(self, *args, **kwargs):

def to_python(self, value):
value = super().to_python(value)
return validators.clean_html(value) if value else value
return sanitize_html.clean_html(value) if value else value

def validate(self, value):
super().validate(value)
Expand Down
56 changes: 2 additions & 54 deletions hawc/apps/common/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,46 +3,16 @@
from functools import partial
from urllib import parse

import bleach
from bleach.css_sanitizer import CSSSanitizer
from django.core.exceptions import ValidationError
from django.core.validators import RegexValidator, URLValidator
from django.utils.encoding import force_str
from pydantic import BaseModel
from pydantic import ValidationError as PydanticValidationError

from .clean.sanitize_html import valid_html_tags

tag_regex = re.compile(r"</?(?P<tag>\w+)[^>]*>")
hyperlink_regex = re.compile(r"href\s*=\s*['\"](.*?)['\"]")

valid_html_tags = {
"a",
"blockquote",
"br",
"div",
"em",
"h1",
"h2",
"h3",
"h4",
"h5",
"li",
"ol",
"p",
"span",
"strong",
"sub",
"sup",
"s",
"ul",
"u",
}
valid_html_attrs = {
"*": ["style"],
"a": ["href", "rel", "target"],
"span": ["class", "data-pk", "data-type"],
"div": ["class", "data-pk", "data-type"],
}
valid_css_properties = {"color", "background-color"}
valid_scheme = {"", "http", "https"}
valid_netloc_endings = {
"canada.ca",
Expand All @@ -59,28 +29,6 @@
}


def clean_html(html: str) -> str:
"""
Cleans given HTML by removing invalid HTML tags, HTML properties, and CSS properties.

Note: inner text within invalid HTML tags will still be included.

Args:
html (str): HTML to clean

Returns:
str: cleaned HTML
"""
css_sanitizer = CSSSanitizer(allowed_css_properties=valid_css_properties)
return bleach.clean(
html,
tags=valid_html_tags,
attributes=valid_html_attrs,
css_sanitizer=css_sanitizer,
strip=True,
)


def validate_html_tags(html: str, field: str | None = None) -> str:
"""Html contains a subset of acceptable tags.

Expand Down
5 changes: 3 additions & 2 deletions hawc/apps/riskofbias/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from rest_framework import serializers

from ..common import validators
from ..common.clean import sanitize_html
from ..common.helper import SerializerHelper, tryParseInt
from ..myuser.models import HAWCUser
from ..myuser.serializers import HAWCUserSerializer
Expand Down Expand Up @@ -103,7 +104,7 @@ class Meta:

def validate_notes(self, value):
validators.validate_hyperlinks(value)
return validators.clean_html(value)
return sanitize_html.clean_html(value)


class RiskOfBiasScoreSerializer(serializers.ModelSerializer):
Expand All @@ -129,7 +130,7 @@ class Meta:

def validate_notes(self, value):
validators.validate_hyperlinks(value)
return validators.clean_html(value)
return sanitize_html.clean_html(value)


class StudyScoreSerializer(RiskOfBiasScoreSerializer):
Expand Down
5 changes: 3 additions & 2 deletions hawc/apps/summary/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from ..assessment.models import DoseUnits
from ..common import validators
from ..common.autocomplete import AutocompleteChoiceField
from ..common.clean import sanitize_html
from ..common.forms import (
BaseFormHelper,
CopyForm,
Expand Down Expand Up @@ -195,7 +196,7 @@ def clean_title(self):
def clean_caption(self):
caption = self.cleaned_data["caption"]
validators.validate_hyperlinks(caption)
return validators.clean_html(caption)
return sanitize_html.clean_html(caption)

def clean_evidence_type(self):
visual_type = self.cleaned_data["visual_type"]
Expand Down Expand Up @@ -679,7 +680,7 @@ def clean_title(self):
def clean_caption(self):
caption = self.cleaned_data["caption"]
validators.validate_hyperlinks(caption)
return validators.clean_html(caption)
return sanitize_html.clean_html(caption)


class DataPivotUploadForm(DataPivotForm):
Expand Down
3 changes: 2 additions & 1 deletion hawc/apps/summary/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from rest_framework import serializers

from ..common import validators
from ..common.clean import sanitize_html
from ..common.helper import SerializerHelper
from ..riskofbias.serializers import AssessmentRiskOfBiasSerializer
from . import constants, models
Expand Down Expand Up @@ -114,7 +115,7 @@ class Meta:

def validate_text(self, value):
validators.validate_hyperlinks(value)
return validators.clean_html(value)
return sanitize_html.clean_html(value)

def validate(self, data):
assessment = data["assessment"]
Expand Down
3 changes: 2 additions & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ redis==5.0.1
requests==2.31.0
urllib3<2
pydantic==2.4.2
bleach[css]==6.1.0
nh3==0.2.15
tinycss2==1.2.1

# computational
numpy==1.26.1
Expand Down
Empty file.
Loading
Loading