From 347d371a7f52832726bf3d6a09c615d087016ec2 Mon Sep 17 00:00:00 2001 From: Santos Gallegos Date: Thu, 4 Mar 2021 20:16:13 -0500 Subject: [PATCH] Embed: replace pyquery with selectolax Don't introduce a new dep. MkDocs doesn't work since it doesn't have fjson files, I'm deleting the test for mkdocs for now, I'll add support for MkDocs once we have the parsing done in a more general way (soon!). --- .../embed/tests/data/mkdocs/latest/index.json | 6 - readthedocs/embed/tests/test_api.py | 63 ++-------- readthedocs/embed/utils.py | 20 ++- readthedocs/embed/views.py | 114 ++++++------------ requirements/pip.txt | 3 - 5 files changed, 65 insertions(+), 141 deletions(-) delete mode 100644 readthedocs/embed/tests/data/mkdocs/latest/index.json diff --git a/readthedocs/embed/tests/data/mkdocs/latest/index.json b/readthedocs/embed/tests/data/mkdocs/latest/index.json deleted file mode 100644 index 829508f65b4..00000000000 --- a/readthedocs/embed/tests/data/mkdocs/latest/index.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "content": "

MkDocs\uf0c1

\n

Project documentation with Markdown.

\n
\n

Overview\uf0c1

\n

MkDocs is a fast, simple and downright gorgeous static site\ngenerator that's geared towards building project documentation. Documentation\nsource files are written in Markdown, and configured with a single YAML\nconfiguration file.

\n

Host anywhere\uf0c1

\n

MkDocs builds completely static HTML sites that you can host on GitHub pages,\nAmazon S3, or anywhere else you choose.

\n

Great themes available\uf0c1

\n

There's a stack of good looking themes available for MkDocs. Choose between\nthe built in themes: mkdocs and readthedocs, select one of the 3rd\nparty themes in the MkDocs wiki, or build your own.

\n

Preview your site as you work\uf0c1

\n

The built-in dev-server allows you to preview your documentation as you're\nwriting it. It will even auto-reload and refresh your browser whenever you save\nyour changes.

\n

Easy to customize\uf0c1

\n

Get your project documentation looking just the way you want it by customizing\nthe theme.

\n
\n

Installation\uf0c1

\n

Install with a Package Manager\uf0c1

\n

If you have and use a package manager (such as apt-get, dnf, homebrew,\nyum, chocolatey, etc.) to install packages on your system, then you may\nwant to search for a \"MkDocs\" package and, if a recent version is available,\ninstall it with your package manager (check your system's documentation for\ndetails). That's it, you're done! Skip down to Getting Started.

\n

If your package manager does not have a recent \"MkDocs\" package, you can still\nuse your package manager to install \"Python\" and \"pip\". Then you can use pip to\ninstall MkDocs.

\n

Manual Installation\uf0c1

\n

In order to manually install MkDocs you'll need Python installed on your\nsystem, as well as the Python package manager, pip. You can check if you have\nthese already installed from the command line:

\n
$ python --version\nPython 2.7.2\n$ pip --version\npip 1.5.2\n
\n\n

MkDocs supports Python versions 2.7, 3.3, 3.4, 3.5 and pypy.

\n

Installing Python\uf0c1

\n

Install Python by downloading an installer appropriate for your system from\npython.org and running it.

\n
\n

Note

\n

If you are installing Python on Windows, be sure to check the box to have\nPython added to your PATH if the installer offers such an option (it's\nnormally off by default).

\n

\"Add

\n
\n

Installing pip\uf0c1

\n

If you're using a recent version of Python, the Python package manager, pip,\nis most likely installed by default. However, you may need to upgrade pip to the\nlasted version:

\n
pip install --upgrade pip\n
\n\n

If you need to install pip for the first time, download get-pip.py.\nThen run the following command to install it:

\n
python get-pip.py\n
\n\n

Installing MkDocs\uf0c1

\n

Install the mkdocs package using pip:

\n
pip install mkdocs\n
\n\n

You should now have the mkdocs command installed on your system. Run mkdocs\n--version to check that everything worked okay.

\n
$ mkdocs --version\nmkdocs, version 0.15.3\n
\n\n
\n

Note

\n

If you are using Windows, some of the above commands may not work\nout-of-the-box.

\n

A quick solution may be to preface every Python command with python -m\nlike this:

\n
python -m pip install mkdocs\npython -m mkdocs\n
\n

For a more permanent solution, you may need to edit your PATH environment\nvariable to include the Scripts directory of your Python installation.\nRecent versions of Python include a script to do this for you. Navigate to\nyour Python installation directory (for example C:\\Python34\\), open the\nTools, then Scripts folder, and run the win_add2path.py file by double\nclicking on it. Alternatively, you can download the script and run it\n(python win_add2path.py).

\n
\n
\n

Getting Started\uf0c1

\n

Getting started is super easy.

\n
mkdocs new my-project\ncd my-project\n
\n\n

Take a moment to review the initial project that has been created for you.

\n

\"The

\n

There's a single configuration file named mkdocs.yml, and a folder named\ndocs that will contain your documentation source files. Right now the docs\nfolder just contains a single documentation page, named index.md.

\n

MkDocs comes with a built-in dev-server that lets you preview your documentation\nas you work on it. Make sure you're in the same directory as the mkdocs.yml\nconfiguration file, and then start the server by running the mkdocs serve\ncommand:

\n
$ mkdocs serve\nINFO    -  Building documentation...\nINFO    -  Cleaning site directory\n[I 160402 15:50:43 server:271] Serving on http://127.0.0.1:8000\n[I 160402 15:50:43 handlers:58] Start watching changes\n[I 160402 15:50:43 handlers:60] Start detecting changes\n
\n\n

Open up http://127.0.0.1:8000/ in your browser, and you'll see the default\nhome page being displayed:

\n

\"The

\n

The dev-server also supports auto-reloading, and will rebuild your documentation\nwhenever anything in the configuration file, documentation directory, or theme\ndirectory changes.

\n

Open the docs/index.md document in your text editor of choice, change the\ninitial heading to MkLorum, and save your changes. Your browser will\nauto-reload and you should see your updated documentation immediately.

\n

Now try editing the configuration file: mkdocs.yml. Change the\nsite_name setting to MkLorum and save the file.

\n
site_name: MkLorum\n
\n\n

Your browser should immediately reload, and you'll see your new site name take\neffect.

\n

\"The

\n

Adding pages\uf0c1

\n

Now add a second page to your documentation:

\n
curl 'https://jaspervdj.be/lorem-markdownum/markdown.txt' > docs/about.md\n
\n\n

As our documentation site will include some navigation headers, you may want to\nedit the configuration file and add some information about the order, title, and\nnesting of each page in the navigation header by adding a pages\nsetting:

\n
site_name: MkLorum\npages:\n    - Home: index.md\n    - About: about.md\n
\n\n

Save your changes and you'll now see a navigation bar with Home and About\nitems on the left as well as Search, Previous, and Next items on the\nright.

\n

\"Screenshot\"

\n

Try the menu items and navigate back and forth between pages. Then click on\nSearch. A search dialog will appear, allowing you to search for any text on\nany page. Notice that the search results include every occurrence of the search\nterm on the site and links directly to the section of the page in which the\nsearch term appears. You get of all that with no effort or configuration on your\npart!

\n

\"Screenshot\"

\n

Theming our documentation\uf0c1

\n

Now change the configuration file to alter how the documentation is displayed by\nchanging the theme. Edit the mkdocs.yml file and add a theme setting:

\n
site_name: MkLorum\npages:\n    - Home: index.md\n    - About: about.md\ntheme: readthedocs\n
\n\n

Save your changes, and you'll see the ReadTheDocs theme being used.

\n

\"Screenshot\"

\n

Changing the Favicon Icon\uf0c1

\n

By default, MkDocs uses the MkDocs favicon icon. To use a different icon, create\nan img subdirectory in your docs_dir and copy your custom favicon.ico file\nto that directory. MkDocs will automatically detect and use that file as your\nfavicon icon.

\n

Building the site\uf0c1

\n

That's looking good. You're ready to deploy the first pass of your MkLorum\ndocumentation. First build the documentation:

\n
mkdocs build\n
\n\n

This will create a new directory, named site. Take a look inside the\ndirectory:

\n
$ ls site\nabout  fonts  index.html  license  search.html\ncss    img    js          mkdocs   sitemap.xml\n
\n\n

Notice that your source documentation has been output as two HTML files named\nindex.html and about/index.html. You also have various other media that's\nbeen copied into the site directory as part of the documentation theme. You\neven have a sitemap.xml file and mkdocs/search_index.json.

\n

If you're using source code control such as git you probably don't want to\ncheck your documentation builds into the repository. Add a line containing\nsite/ to your .gitignore file.

\n
echo "site/" >> .gitignore\n
\n\n

If you're using another source code control tool you'll want to check its\ndocumentation on how to ignore specific directories.

\n

After some time, files may be removed from the documentation but they will still\nreside in the site directory. To remove those stale files, just run mkdocs\nwith the --clean switch.

\n
mkdocs build --clean\n
\n\n

Other Commands and Options\uf0c1

\n

There are various other commands and options available. For a complete list of\ncommands, use the --help flag:

\n
mkdocs --help\n
\n\n

To view a list of options available on a given command, use the --help flag\nwith that command. For example, to get a list of all options available for the\nbuild command run the following:

\n
mkdocs build --help\n
\n\n

Deploying\uf0c1

\n

The documentation site that you just built only uses static files so you'll be\nable to host it from pretty much anywhere. GitHub project pages and Amazon\nS3 may be good hosting options, depending upon your needs. Upload the contents\nof the entire site directory to wherever you're hosting your website from and\nyou're done. For specific instructions on a number of common hosts, see the\nDeploying your Docs page.

\n

Getting help\uf0c1

\n

To get help with MkDocs, please use the discussion group, GitHub issues or\nthe MkDocs IRC channel #mkdocs on freenode.

", - "url": "/", - "language": "en", - "title": "Home" -} \ No newline at end of file diff --git a/readthedocs/embed/tests/test_api.py b/readthedocs/embed/tests/test_api.py index 83073a12852..e0d3bd80d2c 100644 --- a/readthedocs/embed/tests/test_api.py +++ b/readthedocs/embed/tests/test_api.py @@ -6,7 +6,7 @@ import pytest from django_dynamic_fixture import get -from pyquery import PyQuery +from selectolax.parser import HTMLParser from readthedocs.builds.constants import LATEST from readthedocs.embed.views import do_embed @@ -49,8 +49,15 @@ def _patch_sphinx_json_file(self, storage_mock, json_file, html_file): ) def _get_html_content(self, html_file): - section_content = [PyQuery(html_file.open().read()).outerHtml()] - return section_content + content = HTMLParser(html_file.open().read()) + # We override all links inside the embed, + # when doing so, the href attribute gets moved to the end. + # Do the same here. + for anchor in content.css('a'): + href = anchor.attributes.get('href') + if href and 'project.readthedocs.io' in href: + anchor.attrs['href'] = href + return content.body.child.html @mock.patch('readthedocs.embed.views.build_media_storage') def test_embed_unknown_section(self, storage_mock): @@ -119,7 +126,7 @@ def test_embed_sphinx(self, storage_mock, section): ) expected = { - 'content': section_content, + 'content': [section_content], 'headers': [ # TODO: return the full id here {'I Need Secrets (or Environment Variables) in my Build': '#'}, @@ -175,7 +182,7 @@ def test_embed_sphinx_bibtex(self, storage_mock, section): ) expected = { - 'content': section_content, + 'content': [section_content], 'headers': [ {'Getting Started': '#'}, {'Overview': '#overview'}, @@ -236,7 +243,7 @@ def test_embed_sphinx_glossary(self, storage_mock, section): ) expected = { - 'content': section_content, + 'content': [section_content], 'headers': [ {'Glossary': '#'}, ], @@ -250,47 +257,3 @@ def test_embed_sphinx_glossary(self, storage_mock, section): } assert response.data == expected - - @mock.patch('readthedocs.embed.views.build_media_storage') - def test_embed_mkdocs(self, storage_mock): - json_file = data_path / 'mkdocs/latest/index.json' - storage_mock.exists.return_value = True - storage_mock.open.side_effect = self._mock_open( - json_file.open().read() - ) - - self.version.documentation_type = MKDOCS - self.version.save() - - response = do_embed( - project=self.project, - version=self.version, - doc='index', - section='Installation', - path='index.html', - ) - - expected = { - 'content': mock.ANY, # too long to compare here - 'headers': [ - {'Overview': 'overview'}, - {'Installation': 'installation'}, - {'Getting Started': 'getting-started'}, - {'Adding pages': 'adding-pages'}, - {'Theming our documentation': 'theming-our-documentation'}, - {'Changing the Favicon Icon': 'changing-the-favicon-icon'}, - {'Building the site': 'building-the-site'}, - {'Other Commands and Options': 'other-commands-and-options'}, - {'Deploying': 'deploying'}, - {'Getting help': 'getting-help'}, - ], - 'url': 'http://project.readthedocs.io/en/latest/index.html', - 'meta': { - 'project': 'project', - 'version': 'latest', - 'doc': 'index', - 'section': 'Installation', - }, - } - - assert response.data == expected diff --git a/readthedocs/embed/utils.py b/readthedocs/embed/utils.py index 95f8640749f..014fcfdf934 100644 --- a/readthedocs/embed/utils.py +++ b/readthedocs/embed/utils.py @@ -3,10 +3,20 @@ def recurse_while_none(element): """Recursively find the leaf node with the ``href`` attribute.""" - if element.text is None and element.getchildren(): - return recurse_while_none(element.getchildren()[0]) + children = list(element.iter()) + if children: + return recurse_while_none(children[0]) - href = element.attrib.get('href') + href = element.attributes.get('href') if not href: - href = element.attrib.get('id') - return {element.text: href} + href = element.attributes.get('id') + return {element.text(): href} + + +def next_tag(element): + """Return the next non-text sibling of element.""" + while element: + element = element.next + if element.tag != '-text': + return element + return None diff --git a/readthedocs/embed/views.py b/readthedocs/embed/views.py index f9ad80481bf..963c953fbfa 100644 --- a/readthedocs/embed/views.py +++ b/readthedocs/embed/views.py @@ -10,18 +10,18 @@ from django.template.defaultfilters import slugify from django.utils.functional import cached_property from docutils.nodes import make_id -from pyquery import PyQuery as PQ # noqa from rest_framework import status from rest_framework.renderers import BrowsableAPIRenderer, JSONRenderer from rest_framework.response import Response from rest_framework.views import APIView +from selectolax.parser import HTMLParser from readthedocs.api.v2.permissions import IsAuthorizedToViewVersion from readthedocs.builds.constants import EXTERNAL from readthedocs.core.resolver import resolve from readthedocs.core.unresolver import unresolve from readthedocs.core.utils.extend import SettingsOverrideObject -from readthedocs.embed.utils import recurse_while_none +from readthedocs.embed.utils import next_tag, recurse_while_none from readthedocs.projects.models import Project from readthedocs.storage import build_media_storage @@ -35,7 +35,7 @@ def escape_selector(selector): return ret -def clean_links(obj, url): +def clean_links(node, url): """ Rewrite (internal) links to make them absolute. @@ -44,24 +44,24 @@ def clean_links(obj, url): 3. prepend URL (without filename) to internal relative links """ if url is None: - return obj + return node - for link in obj.find('a'): + for link in node.css('a'): base_url = urlparse(url) # We need to make all internal links, to be absolute - href = link.attrib['href'] + href = link.attributes.get('href') + if not href: + continue + parsed_href = urlparse(href) if parsed_href.scheme or parsed_href.path.startswith('/'): - # don't change external links + # don't change absolute paths/URLs continue if not parsed_href.path and parsed_href.fragment: # href="#section-link" - new_href = base_url.geturl() + href - link.attrib['href'] = new_href - continue - - if not base_url.path.endswith('/'): + link.attrs['href'] = base_url.geturl() + href + elif not base_url.path.endswith('/'): # internal relative link # href="../../another.html" and ``base_url`` is not HTMLDir # (e.g. /en/latest/deep/internal/section/page.html) @@ -73,11 +73,7 @@ def clean_links(obj, url): path, _ = base_url.path.rsplit('/', 1) # append the value of href (../../another.html) to the base URL. base_url = base_url._replace(path=path + '/') - - new_href = base_url.geturl() + href - link.attrib['href'] = new_href - - return obj + link.attrs['href'] = base_url.geturl() + href class EmbedAPIBase(APIView): @@ -276,7 +272,7 @@ def parse_sphinx(content, section, url): headers = [ recurse_while_none(element) - for element in PQ(toc)('a') + for element in HTMLParser(toc).css('a') ] if not section and headers: @@ -288,7 +284,7 @@ def parse_sphinx(content, section, url): if not section: return [], headers, None - body_obj = PQ(body) + body_obj = HTMLParser(body) escaped_section = escape_selector(section) elements_id = [ @@ -297,22 +293,24 @@ def parse_sphinx(content, section, url): make_id(escaped_section), f'module-{escaped_section}', ] - query_result = [] + query_result = None for element_id in elements_id: if not element_id: continue - query_result = body_obj(f'#{element_id}') + query_result = body_obj.css_first(f'#{element_id}') if query_result: break if not query_result: - selector = f':header:contains("{escaped_section}")' - query_result = body_obj(selector).parent() + selector = f'[header~={escaped_section}]' + query_result = body_obj.css_first(selector) + if query_result: + query_result = query_result.parent # Handle ``dt`` special cases - if len(query_result) == 1 and query_result[0].tag == 'dt': - parent = query_result.parent() - if 'glossary' in parent.attr('class'): + if query_result and query_result.tag == 'dt': + parent = query_result.parent + if 'glossary' in parent.attributes.get('class'): # Sphinx HTML structure for term glossary puts the ``id`` in the # ``dt`` element with the title of the term. In this case, we # need to return the next sibling which contains the definition @@ -324,8 +322,8 @@ def parse_sphinx(content, section, url): #
Text definition for the term
# ... # - query_result = query_result.next() - elif 'citation' in parent.attr('class'): + query_result = next_tag(query_result) + elif 'citation' in parent.attributes.get('class'): # Sphinx HTML structure for sphinxcontrib-bibtex puts the ``id`` in the # ``dt`` element with the title of the cite. In this case, we # need to return the next sibling which contains the cite itself. @@ -336,7 +334,7 @@ def parse_sphinx(content, section, url): #
Content of the cite
# ... # - query_result = query_result.next() + query_result = next_tag(query_result) else: # Sphinx HTML structure for definition list puts the ``id`` # the ``dt`` element, instead of the ``dl``. This makes @@ -352,58 +350,20 @@ def parse_sphinx(content, section, url): # query_result = parent - def dump(obj): - """Handle API-based doc HTML.""" - if obj[0].tag in ['span', 'h2']: - return obj.parent().outerHtml() - return obj.outerHtml() + # Return the outer html for these elements + if query_result and query_result.tag in ['span', 'h2'] and query_result.parent: + query_result = query_result.parent - ret = [ - dump(clean_links(PQ(obj), url)) - for obj in query_result - ] - return ret, headers, section + section_html = [] + if query_result: + clean_links(query_result, url) + section_html = [query_result.html] + + return section_html, headers, section def parse_mkdocs(content, section, url): # pylint: disable=unused-argument """Get the embed content for the section.""" ret = [] headers = [] - - if not content or not content.get('content'): - return (None, None, section) - - body = content['content'] - for element in PQ(body)('h2'): - headers.append(recurse_while_none(element)) - - if not section and headers: - # If no section is sent, return the content of the first one - section = list(headers[0].keys())[0].lower() - - if section: - body_obj = PQ(body) - escaped_section = escape_selector(section) - section_list = body_obj( - ':header:contains("{title}")'.format(title=str(escaped_section))) - for num in range(len(section_list)): - header2 = section_list.eq(num) - # h2_title = h2.text().strip() - # section_id = h2.attr('id') - h2_content = "" - next_p = header2.next() - while next_p: - if next_p[0].tag == 'h2': - break - h2_html = next_p.outerHtml() - if h2_html: - h2_content += "\n%s\n" % h2_html - next_p = next_p.next() - if h2_content: - ret.append(h2_content) - # ret.append({ - # 'id': section_id, - # 'title': h2_title, - # 'content': h2_content, - # }) - return (ret, headers, section) + return ret, headers, section diff --git a/requirements/pip.txt b/requirements/pip.txt index 539cf25e327..0d1d8fbb2d0 100644 --- a/requirements/pip.txt +++ b/requirements/pip.txt @@ -51,9 +51,6 @@ elasticsearch-dsl==7.3.0 # pyup: <8.0 django-elasticsearch-dsl==7.1.4 # pyup: <8.0 selectolax==0.2.10 -# embed -pyquery==1.4.3 - # NOTE: this dep can be removed in python 3.7 in favor of ``date.fromisoformat`` python-dateutil==2.8.1