Embed: replace pyquery with selectolax

Don't introduce a new dep. MkDocs doesn't work since it doesn't have fjson files, so I'm deleting the MkDocs test for now. I'll add support for MkDocs once we have the parsing done in a more general way (soon!).
readthedocs · Mar 5, 2021 · 347d371 · 347d371
1 parent 9b8e506
commit 347d371
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 141 deletions.
diff --git a/readthedocs/embed/tests/data/mkdocs/latest/index.json b/readthedocs/embed/tests/data/mkdocs/latest/index.json
diff --git a/readthedocs/embed/tests/test_api.py b/readthedocs/embed/tests/test_api.py
@@ -6,7 +6,7 @@
 
 import pytest
 from django_dynamic_fixture import get
-from pyquery import PyQuery
+from selectolax.parser import HTMLParser
 
 from readthedocs.builds.constants import LATEST
 from readthedocs.embed.views import do_embed
@@ -49,8 +49,15 @@ def _patch_sphinx_json_file(self, storage_mock, json_file, html_file):
         )
 
     def _get_html_content(self, html_file):
-        section_content = [PyQuery(html_file.open().read()).outerHtml()]
-        return section_content
+        content = HTMLParser(html_file.open().read())
+        # We override all links inside the embed,
+        # when doing so, the href attribute gets moved to the end.
+        # Do the same here.
+        for anchor in content.css('a'):
+            href = anchor.attributes.get('href')
+            if href and 'project.readthedocs.io' in href:
+                anchor.attrs['href'] = href
+        return content.body.child.html
 
     @mock.patch('readthedocs.embed.views.build_media_storage')
     def test_embed_unknown_section(self, storage_mock):
@@ -119,7 +126,7 @@ def test_embed_sphinx(self, storage_mock, section):
         )
 
         expected = {
-            'content': section_content,
+            'content': [section_content],
             'headers': [
                 # TODO: return the full id here
                 {'I Need Secrets (or Environment Variables) in my Build': '#'},
@@ -175,7 +182,7 @@ def test_embed_sphinx_bibtex(self, storage_mock, section):
         )
 
         expected = {
-            'content': section_content,
+            'content': [section_content],
             'headers': [
                 {'Getting Started': '#'},
                 {'Overview': '#overview'},
@@ -236,7 +243,7 @@ def test_embed_sphinx_glossary(self, storage_mock, section):
         )
 
         expected = {
-            'content': section_content,
+            'content': [section_content],
             'headers': [
                 {'Glossary': '#'},
             ],
@@ -250,47 +257,3 @@ def test_embed_sphinx_glossary(self, storage_mock, section):
         }
 
         assert response.data == expected
-
-    @mock.patch('readthedocs.embed.views.build_media_storage')
-    def test_embed_mkdocs(self, storage_mock):
-        json_file = data_path / 'mkdocs/latest/index.json'
-        storage_mock.exists.return_value = True
-        storage_mock.open.side_effect = self._mock_open(
-            json_file.open().read()
-        )
-
-        self.version.documentation_type = MKDOCS
-        self.version.save()
-
-        response = do_embed(
-            project=self.project,
-            version=self.version,
-            doc='index',
-            section='Installation',
-            path='index.html',
-        )
-
-        expected = {
-            'content': mock.ANY,  # too long to compare here
-            'headers': [
-                {'Overview': 'overview'},
-                {'Installation': 'installation'},
-                {'Getting Started': 'getting-started'},
-                {'Adding pages': 'adding-pages'},
-                {'Theming our documentation': 'theming-our-documentation'},
-                {'Changing the Favicon Icon': 'changing-the-favicon-icon'},
-                {'Building the site': 'building-the-site'},
-                {'Other Commands and Options': 'other-commands-and-options'},
-                {'Deploying': 'deploying'},
-                {'Getting help': 'getting-help'},
-            ],
-            'url': 'http://project.readthedocs.io/en/latest/index.html',
-            'meta': {
-                'project': 'project',
-                'version': 'latest',
-                'doc': 'index',
-                'section': 'Installation',
-            },
-        }
-
-        assert response.data == expected
diff --git a/readthedocs/embed/utils.py b/readthedocs/embed/utils.py
@@ -3,10 +3,20 @@
 
 def recurse_while_none(element):
     """Recursively find the leaf node with the ``href`` attribute."""
-    if element.text is None and element.getchildren():
-        return recurse_while_none(element.getchildren()[0])
+    children = list(element.iter())
+    if children:
+        return recurse_while_none(children[0])
 
-    href = element.attrib.get('href')
+    href = element.attributes.get('href')
     if not href:
-        href = element.attrib.get('id')
-    return {element.text: href}
+        href = element.attributes.get('id')
+    return {element.text(): href}
+
+
+def next_tag(element):
+    """Return the next non-text sibling of element, or None if there is none."""
+    while element:
+        element = element.next
+        if element is not None and element.tag != '-text':
+            return element
+    return None