Skip to content

Commit

Permalink
fix: remove nested CDATA declarations in content (#231)
Browse files Browse the repository at this point in the history
During OLX node creation, some XBlocks will wrap their entire contents
with CDATA (e.g. the HTMLBlock in libraries). This can cause errors
when that wrapped content contains its own CDATA sections, because
CDATA sections cannot nest.

This can happen in HTML documents where the contents of a <script> tag
are sometimes enclosed in CDATA. Modern HTML does not require this, but
it was a common practice when XHTML was an accepted standard, as this
would make those documents easier to parse as XML.

This commit will remove nested CDATA declarations while preserving
their content, so that the top-level CDATA can work as expected.
  • Loading branch information
myhailo-chernyshov-rg authored Dec 11, 2024
1 parent 11104d4 commit 8e239ad
Show file tree
Hide file tree
Showing 9 changed files with 262 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/cc2olx/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"
4 changes: 3 additions & 1 deletion src/cc2olx/olx.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from cc2olx.iframe_link_parser import KalturaIframeLinkParser

from cc2olx.qti import QtiExport
from cc2olx.utils import element_builder, passport_file_parser
from cc2olx.utils import clean_from_cdata, element_builder, passport_file_parser

logger = logging.getLogger()

Expand Down Expand Up @@ -363,6 +363,7 @@ def _process_html(self, details):
html = self._process_static_links(details["html"])
if self.link_file:
html, video_olx = self._process_html_for_iframe(html)
html = clean_from_cdata(html)
txt = self.doc.createCDATASection(html)
child.appendChild(txt)
nodes.append(child)
Expand Down Expand Up @@ -434,6 +435,7 @@ def _create_discussion_node(self, details):
node.setAttribute("discussion_target", details["title"])
html_node = self.doc.createElement("html")
txt = "MISSING CONTENT" if details["text"] is None else details["text"]
txt = clean_from_cdata(txt)
txt = self.doc.createCDATASection(txt)
html_node.appendChild(txt)
return [html_node, node]
Expand Down
15 changes: 15 additions & 0 deletions src/cc2olx/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import csv
import re

from cc2olx.constants import CDATA_PATTERN

logger = logging.getLogger()


Expand Down Expand Up @@ -108,3 +110,16 @@ def clean_file_name(filename: str):

cleaned_name = re.sub(special_characters, "_", filename)
return cleaned_name


def clean_from_cdata(xml_string: str) -> str:
"""
Deletes CDATA tag from XML string while keeping its content.
Args:
xml_string (str): original XML string to clean.
Returns:
str: cleaned XML string.
"""
return re.sub(CDATA_PATTERN, r"\g<content>", xml_string, flags=re.DOTALL)
63 changes: 63 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import shutil
import zipfile

import xml.dom.minidom
from pathlib import Path
from tempfile import NamedTemporaryFile
from xml.dom.minidom import parse
Expand All @@ -12,6 +13,7 @@

from cc2olx.cli import parse_args
from cc2olx.models import Cartridge
from cc2olx.olx import OlxExport
from cc2olx.settings import collect_settings


Expand Down Expand Up @@ -242,3 +244,64 @@ def transcript_file(fixtures_data_dir):
fixtures_data_dir / "video_files/01___Intro_to_Knowledge_Based_AI/0 - Introductions.en.srt"
)
return transcript_file_path


@pytest.fixture(scope="session")
def html_without_cdata(fixtures_data_dir: Path) -> str:
"""
HTML string that doesn't contain CDATA sections.
Args:
fixtures_data_dir (str): Path to the directory where fixture data is present.
Returns:
str: HTML string.
"""
html_without_cdata_path = fixtures_data_dir / "html_files/html-without-cdata.html"
return html_without_cdata_path.read_text()


@pytest.fixture(scope="session")
def cdata_containing_html(fixtures_data_dir: Path) -> str:
"""
HTML string that contains CDATA sections.
Args:
fixtures_data_dir (str): Path to the directory where fixture data is present.
Returns:
str: HTML string.
"""
html_without_cdata_path = fixtures_data_dir / "html_files/cdata-containing-html.html"
return html_without_cdata_path.read_text()


@pytest.fixture(scope="session")
def expected_cleaned_cdata_containing_html(fixtures_data_dir: Path) -> str:
"""
The string with expected HTML after cleaning from CDATA sections.
Args:
fixtures_data_dir (str): Path to the directory where fixture data is present.
Returns:
str: HTML string.
"""
html_without_cdata_path = fixtures_data_dir / "html_files/cleaned-cdata-containing-html.html"
return html_without_cdata_path.read_text()


@pytest.fixture
def bare_olx_exporter(cartridge: Cartridge) -> OlxExport:
"""
Provides bare OLX exporter.
Args:
cartridge (Cartridge): Cartridge class instance.
Returns:
OlxExport: OlxExport instance.
"""
olx_exporter = OlxExport(cartridge)
olx_exporter.doc = xml.dom.minidom.Document()
return olx_exporter
15 changes: 15 additions & 0 deletions tests/fixtures_data/html_files/cdata-containing-html.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>CDATA containing HTML document</title>
</head>
<body>
<script type="text/javascript">
<![CDATA[
var htmlContent = "<div>Hello, world!</div>";
alert(htmlContent);
]]>
</script>
</body>
</html>
15 changes: 15 additions & 0 deletions tests/fixtures_data/html_files/cleaned-cdata-containing-html.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>CDATA containing HTML document</title>
</head>
<body>
<script type="text/javascript">

var htmlContent = "<div>Hello, world!</div>";
alert(htmlContent);

</script>
</body>
</html>
13 changes: 13 additions & 0 deletions tests/fixtures_data/html_files/html-without-cdata.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>HTML document without CDATA</title>
</head>
<body>
<script type="text/javascript">
var htmlContent = "<div>Hello, world!</div>";
alert(htmlContent);
</script>
</body>
</html>
104 changes: 102 additions & 2 deletions tests/test_olx.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import json
from unittest.mock import Mock

import lxml
import xml.dom.minidom

from cc2olx import olx

from .utils import format_xml
import xml.dom.minidom
import lxml


def test_olx_export_xml(cartridge, link_map_csv, studio_course_xml):
Expand Down Expand Up @@ -36,6 +40,54 @@ def test_process_link():
)


class TestOlXExporeterHTMLProcessing:
"""
Test the OLX exporter for HTML parsing flow.
"""

def test_html_cleaning_from_cdata(
self,
mocker,
bare_olx_exporter,
cdata_containing_html,
expected_cleaned_cdata_containing_html,
):
"""
Test that CDATA cleaning function is called during HTML processing.
Args:
mocker (MockerFixture): MockerFixture instance.
bare_olx_exporter (OlxExport): bare OLX exporter.
cdata_containing_html (str): HTML that contains CDATA tags.
expected_cleaned_cdata_containing_html (str): Expected HTML after
successful cleaning.
"""
details = {"html": cdata_containing_html}

clean_from_cdata_mock = mocker.patch(
"cc2olx.olx.clean_from_cdata",
return_value=expected_cleaned_cdata_containing_html,
)

bare_olx_exporter._process_html(details)

clean_from_cdata_mock.assert_called_once()

def test_processed_html_content_is_wrapped_into_cdata(self, bare_olx_exporter, cdata_containing_html):
"""
Test that processed HTML content is wrapped into CDATA section.
Args:
bare_olx_exporter (OlxExport): bare OLX exporter.
cdata_containing_html (str): HTML that contains CDATA tags.
"""
details = {"html": cdata_containing_html}

result_html, *__ = bare_olx_exporter._process_html(details)

assert isinstance(result_html.childNodes[0], xml.dom.minidom.CDATASection)


class TestOlXExporeterIframeParser:
"""
Test the olx exporter for iframe link parsing flow
Expand Down Expand Up @@ -141,3 +193,51 @@ def test_policy_contains_advanced_module(self, cartridge, passports_csv, caplog)
assert ["Missing LTI Passport for learning_tools_interoperability. Using default."] == [
rec.message for rec in caplog.records
]


class TestDiscussionParsing:
"""
Test the OLX exporter for discussion parsing flow.
"""

def test_discussion_content_cleaning_from_cdata(
self,
mocker,
bare_olx_exporter,
cdata_containing_html,
expected_cleaned_cdata_containing_html,
):
"""
Test that CDATA cleaning function is called during discussion parsing.
Args:
mocker (MockerFixture): MockerFixture instance.
bare_olx_exporter (OlxExport): bare OLX exporter.
cdata_containing_html (str): HTML that contains CDATA tags.
expected_cleaned_cdata_containing_html (str): Expected HTML after
successful cleaning.
"""
details = {"dependencies": [], "title": Mock(), "text": cdata_containing_html}

clean_from_cdata_mock = mocker.patch(
"cc2olx.olx.clean_from_cdata",
return_value=expected_cleaned_cdata_containing_html,
)

bare_olx_exporter._create_discussion_node(details)

clean_from_cdata_mock.assert_called_once()

def test_discussion_decription_is_wrapped_into_cdata(self, bare_olx_exporter, cdata_containing_html):
"""
Test that processed HTML content is wrapped into CDATA section.
Args:
bare_olx_exporter (OlxExport): bare OLX exporter.
cdata_containing_html (str): HTML that contains CDATA tags.
"""
details = {"dependencies": [], "title": Mock(), "text": cdata_containing_html}

discussion_decription_html, __ = bare_olx_exporter._create_discussion_node(details)

assert isinstance(discussion_decription_html.childNodes[0], xml.dom.minidom.CDATASection)
35 changes: 35 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from cc2olx.utils import clean_from_cdata


class TestXMLCleaningFromCDATA:
"""
Test XML string cleaning from CDATA sections.
"""

def test_cdata_containing_html_is_cleaned_successfully(
self,
cdata_containing_html: str,
expected_cleaned_cdata_containing_html: str,
) -> None:
"""
Test if CDATA tags are removed from HTML while their content is kept.
Args:
cdata_containing_html (str): HTML that contains CDATA tags.
expected_cleaned_cdata_containing_html (str): Expected HTML after
successful cleaning.
"""
actual_cleaned_cdata_containing_html = clean_from_cdata(cdata_containing_html)

assert actual_cleaned_cdata_containing_html == expected_cleaned_cdata_containing_html

def test_html_without_cdata_remains_the_same_after_cleaning(self, html_without_cdata: str) -> None:
"""
Test if HTML that doesn't contain CDATA tags remains the same.
Args:
html_without_cdata (str): HTML that doesn't contains CDATA tags.
"""
actual_cleaned_html_without_cdata = clean_from_cdata(html_without_cdata)

assert actual_cleaned_html_without_cdata == html_without_cdata

0 comments on commit 8e239ad

Please sign in to comment.