fix: remove nested CDATA declarations in content (#231)

During OLX node creation, some XBlocks will wrap their entire contents with CDATA (e.g. the HTMLBlock in libraries). This can cause errors when that wrapped content contains its own CDATA sections, because CDATA sections cannot nest. This can happen in HTML documents where the contents of a <script> tag are sometimes enclosed in CDATA. Modern HTML does not require this, but it was a common practice when XHTML was an accepted standard, as this would make those documents easier to parse as XML. This commit will remove nested CDATA declarations while preserving their content, so that the top-level CDATA can work as expected.
openedx · Dec 11, 2024 · 8e239ad · 8e239ad
1 parent 11104d4
commit 8e239ad
Show file tree

Hide file tree

Showing 9 changed files with 262 additions and 3 deletions.
diff --git a/src/cc2olx/constants.py b/src/cc2olx/constants.py
@@ -0,0 +1 @@
+CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"
diff --git a/src/cc2olx/olx.py b/src/cc2olx/olx.py
@@ -8,7 +8,7 @@
 from cc2olx.iframe_link_parser import KalturaIframeLinkParser
 
 from cc2olx.qti import QtiExport
-from cc2olx.utils import element_builder, passport_file_parser
+from cc2olx.utils import clean_from_cdata, element_builder, passport_file_parser
 
 logger = logging.getLogger()
 
@@ -363,6 +363,7 @@ def _process_html(self, details):
         html = self._process_static_links(details["html"])
         if self.link_file:
             html, video_olx = self._process_html_for_iframe(html)
+        html = clean_from_cdata(html)
         txt = self.doc.createCDATASection(html)
         child.appendChild(txt)
         nodes.append(child)
@@ -434,6 +435,7 @@ def _create_discussion_node(self, details):
         node.setAttribute("discussion_target", details["title"])
         html_node = self.doc.createElement("html")
         txt = "MISSING CONTENT" if details["text"] is None else details["text"]
+        txt = clean_from_cdata(txt)
         txt = self.doc.createCDATASection(txt)
         html_node.appendChild(txt)
         return [html_node, node]

diff --git a/src/cc2olx/utils.py b/src/cc2olx/utils.py
@@ -5,6 +5,8 @@
 import csv
 import re
 
+from cc2olx.constants import CDATA_PATTERN
+
 logger = logging.getLogger()
 
 
@@ -108,3 +110,16 @@ def clean_file_name(filename: str):
 
     cleaned_name = re.sub(special_characters, "_", filename)
     return cleaned_name
+
+
+def clean_from_cdata(xml_string: str) -> str:
+    """
+    Deletes CDATA tag from XML string while keeping its content.
+
+    Args:
+        xml_string (str): original XML string to clean.
+
+    Returns:
+        str: cleaned XML string.
+    """
+    return re.sub(CDATA_PATTERN, r"\g<content>", xml_string, flags=re.DOTALL)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -4,6 +4,7 @@
 import shutil
 import zipfile
 
+import xml.dom.minidom
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from xml.dom.minidom import parse
@@ -12,6 +13,7 @@
 
 from cc2olx.cli import parse_args
 from cc2olx.models import Cartridge
+from cc2olx.olx import OlxExport
 from cc2olx.settings import collect_settings
 
 
@@ -242,3 +244,64 @@ def transcript_file(fixtures_data_dir):
         fixtures_data_dir / "video_files/01___Intro_to_Knowledge_Based_AI/0 - Introductions.en.srt"
     )
     return transcript_file_path
+
+
+@pytest.fixture(scope="session")
+def html_without_cdata(fixtures_data_dir: Path) -> str:
+    """
+    HTML string that doesn't contain CDATA sections.
+
+    Args:
+        fixtures_data_dir (str): Path to the directory where fixture data is present.
+
+    Returns:
+        str: HTML string.
+    """
+    html_without_cdata_path = fixtures_data_dir / "html_files/html-without-cdata.html"
+    return html_without_cdata_path.read_text()
+
+
+@pytest.fixture(scope="session")
+def cdata_containing_html(fixtures_data_dir: Path) -> str:
+    """
+    HTML string that contains CDATA sections.
+
+    Args:
+        fixtures_data_dir (str): Path to the directory where fixture data is present.
+
+    Returns:
+        str: HTML string.
+    """
+    html_without_cdata_path = fixtures_data_dir / "html_files/cdata-containing-html.html"
+    return html_without_cdata_path.read_text()
+
+
+@pytest.fixture(scope="session")
+def expected_cleaned_cdata_containing_html(fixtures_data_dir: Path) -> str:
+    """
+    The string with expected HTML after cleaning from CDATA sections.
+
+    Args:
+        fixtures_data_dir (str): Path to the directory where fixture data is present.
+
+    Returns:
+        str: HTML string.
+    """
+    html_without_cdata_path = fixtures_data_dir / "html_files/cleaned-cdata-containing-html.html"
+    return html_without_cdata_path.read_text()
+
+
+@pytest.fixture
+def bare_olx_exporter(cartridge: Cartridge) -> OlxExport:
+    """
+    Provides bare OLX exporter.
+
+    Args:
+        cartridge (Cartridge): Cartridge class instance.
+
+    Returns:
+        OlxExport: OlxExport instance.
+    """
+    olx_exporter = OlxExport(cartridge)
+    olx_exporter.doc = xml.dom.minidom.Document()
+    return olx_exporter
diff --git a/tests/fixtures_data/html_files/cdata-containing-html.html b/tests/fixtures_data/html_files/cdata-containing-html.html
@@ -0,0 +1,15 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+    <head>
+        <title>CDATA containing HTML document</title>
+    </head>
+    <body>
+        <script type="text/javascript">
+            <![CDATA[
+            var htmlContent = "<div>Hello, world!</div>";
+            alert(htmlContent);
+            ]]>
+        </script>
+    </body>
+</html>
diff --git a/tests/fixtures_data/html_files/cleaned-cdata-containing-html.html b/tests/fixtures_data/html_files/cleaned-cdata-containing-html.html
@@ -0,0 +1,15 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+    <head>
+        <title>CDATA containing HTML document</title>
+    </head>
+    <body>
+        <script type="text/javascript">
+
+            var htmlContent = "<div>Hello, world!</div>";
+            alert(htmlContent);
+
+        </script>
+    </body>
+</html>
diff --git a/tests/fixtures_data/html_files/html-without-cdata.html b/tests/fixtures_data/html_files/html-without-cdata.html
@@ -0,0 +1,13 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+    <head>
+        <title>HTML document without CDATA</title>
+    </head>
+    <body>
+        <script type="text/javascript">
+            var htmlContent = "<div>Hello, world!</div>";
+            alert(htmlContent);
+        </script>
+    </body>
+</html>
diff --git a/tests/test_olx.py b/tests/test_olx.py
@@ -1,8 +1,12 @@
 import json
+from unittest.mock import Mock
+
+import lxml
+import xml.dom.minidom
+
 from cc2olx import olx
+
 from .utils import format_xml
-import xml.dom.minidom
-import lxml
 
 
 def test_olx_export_xml(cartridge, link_map_csv, studio_course_xml):
@@ -36,6 +40,54 @@ def test_process_link():
     )
 
 
+class TestOlXExporeterHTMLProcessing:
+    """
+    Test the OLX exporter for HTML parsing flow.
+    """
+
+    def test_html_cleaning_from_cdata(
+        self,
+        mocker,
+        bare_olx_exporter,
+        cdata_containing_html,
+        expected_cleaned_cdata_containing_html,
+    ):
+        """
+        Test that CDATA cleaning function is called during HTML processing.
+
+        Args:
+            mocker (MockerFixture): MockerFixture instance.
+            bare_olx_exporter (OlxExport): bare OLX exporter.
+            cdata_containing_html (str): HTML that contains CDATA tags.
+            expected_cleaned_cdata_containing_html (str): Expected HTML after
+                successful cleaning.
+        """
+        details = {"html": cdata_containing_html}
+
+        clean_from_cdata_mock = mocker.patch(
+            "cc2olx.olx.clean_from_cdata",
+            return_value=expected_cleaned_cdata_containing_html,
+        )
+
+        bare_olx_exporter._process_html(details)
+
+        clean_from_cdata_mock.assert_called_once()
+
+    def test_processed_html_content_is_wrapped_into_cdata(self, bare_olx_exporter, cdata_containing_html):
+        """
+        Test that processed HTML content is wrapped into CDATA section.
+
+        Args:
+            bare_olx_exporter (OlxExport): bare OLX exporter.
+            cdata_containing_html (str): HTML that contains CDATA tags.
+        """
+        details = {"html": cdata_containing_html}
+
+        result_html, *__ = bare_olx_exporter._process_html(details)
+
+        assert isinstance(result_html.childNodes[0], xml.dom.minidom.CDATASection)
+
+
 class TestOlXExporeterIframeParser:
     """
     Test the olx exporter for iframe link parsing flow
@@ -141,3 +193,51 @@ def test_policy_contains_advanced_module(self, cartridge, passports_csv, caplog)
         assert ["Missing LTI Passport for learning_tools_interoperability. Using default."] == [
             rec.message for rec in caplog.records
         ]
+
+
+class TestDiscussionParsing:
+    """
+    Test the OLX exporter for discussion parsing flow.
+    """
+
+    def test_discussion_content_cleaning_from_cdata(
+        self,
+        mocker,
+        bare_olx_exporter,
+        cdata_containing_html,
+        expected_cleaned_cdata_containing_html,
+    ):
+        """
+        Test that CDATA cleaning function is called during discussion parsing.
+
+        Args:
+            mocker (MockerFixture): MockerFixture instance.
+            bare_olx_exporter (OlxExport): bare OLX exporter.
+            cdata_containing_html (str): HTML that contains CDATA tags.
+            expected_cleaned_cdata_containing_html (str): Expected HTML after
+                successful cleaning.
+        """
+        details = {"dependencies": [], "title": Mock(), "text": cdata_containing_html}
+
+        clean_from_cdata_mock = mocker.patch(
+            "cc2olx.olx.clean_from_cdata",
+            return_value=expected_cleaned_cdata_containing_html,
+        )
+
+        bare_olx_exporter._create_discussion_node(details)
+
+        clean_from_cdata_mock.assert_called_once()
+
+    def test_discussion_decription_is_wrapped_into_cdata(self, bare_olx_exporter, cdata_containing_html):
+        """
+        Test that processed HTML content is wrapped into CDATA section.
+
+        Args:
+            bare_olx_exporter (OlxExport): bare OLX exporter.
+            cdata_containing_html (str): HTML that contains CDATA tags.
+        """
+        details = {"dependencies": [], "title": Mock(), "text": cdata_containing_html}
+
+        discussion_decription_html, __ = bare_olx_exporter._create_discussion_node(details)
+
+        assert isinstance(discussion_decription_html.childNodes[0], xml.dom.minidom.CDATASection)
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -0,0 +1,35 @@
+from cc2olx.utils import clean_from_cdata
+
+
+class TestXMLCleaningFromCDATA:
+    """
+    Test XML string cleaning from CDATA sections.
+    """
+
+    def test_cdata_containing_html_is_cleaned_successfully(
+        self,
+        cdata_containing_html: str,
+        expected_cleaned_cdata_containing_html: str,
+    ) -> None:
+        """
+        Test if CDATA tags are removed from HTML while their content is kept.
+
+        Args:
+            cdata_containing_html (str): HTML that contains CDATA tags.
+            expected_cleaned_cdata_containing_html (str): Expected HTML after
+                successful cleaning.
+        """
+        actual_cleaned_cdata_containing_html = clean_from_cdata(cdata_containing_html)
+
+        assert actual_cleaned_cdata_containing_html == expected_cleaned_cdata_containing_html
+
+    def test_html_without_cdata_remains_the_same_after_cleaning(self, html_without_cdata: str) -> None:
+        """
+        Test if HTML that doesn't contain CDATA tags remains the same.
+
+        Args:
+            html_without_cdata (str): HTML that doesn't contains CDATA tags.
+        """
+        actual_cleaned_html_without_cdata = clean_from_cdata(html_without_cdata)
+
+        assert actual_cleaned_html_without_cdata == html_without_cdata
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		CDATA_PATTERN = r"<!\[CDATA\[(?P<content>.*?)\]\]>"