readthedocs · humitos · Sep 21, 2021 · Jul 5, 2021 · Jul 6, 2021 · Jul 6, 2021
diff --git a/readthedocs/embed/utils.py b/readthedocs/embed/utils.py
@@ -1,5 +1,6 @@
 """Embed utils."""
-
+from urllib.parse import urlparse
+from pyquery import PyQuery as PQ  # noqa
 
 def recurse_while_none(element):
     """Recursively find the leaf node with the ``href`` attribute."""
@@ -10,3 +11,55 @@ def recurse_while_none(element):
     if not href:
         href = element.attrib.get('id')
     return {element.text: href}
+
+
+def clean_links(obj, url, html_raw_response=False):
+    """
+    Rewrite (internal) links to make them absolute.
+
+    1. external links (with a scheme) and absolute paths are not changed
+    2. prepend URL to links that are just fragments (e.g. #section)
+    3. prepend URL (without filename) to internal relative links
+
+    ``<a>`` elements without an ``href`` (e.g. ``<a id="anchor">``) are
+    skipped. Any fragment present in ``url`` is ignored when building the new
+    links, and so is its query string for relative links.
+
+    :param obj: HTML content, wrapped with ``PyQuery`` to be processed
+    :param url: URL of the document the content was taken from. When it is
+        ``None`` no link is rewritten.
+    :param html_raw_response: return the outer HTML (``obj.outerHtml()``)
+        instead of the ``PyQuery`` object
+    """
+
+    # TODO: do not depend on PyQuery
+    obj = PQ(obj)
+
+    if url is not None:
+        # The document URL is the same for every link: parse it only once.
+        # ``url`` may carry its own fragment; drop it so it does not end up in
+        # the middle of the rewritten links (``page.html#one#two``).
+        document_url = urlparse(url)._replace(fragment='')
+
+        # Base for relative links: when ``url`` is not HTMLDir
+        # (e.g. /en/latest/deep/internal/section/page.html) remove the
+        # trailing filename (page.html) and use the rest as base URL.
+        # ``rpartition`` also copes with an empty path (https://example.com).
+        directory_path, _, _ = document_url.path.rpartition('/')
+        directory_url = document_url._replace(
+            path=directory_path + '/',
+            query='',
+        )
+
+        for link in obj.find('a'):
+            href = link.attrib.get('href')
+            if not href:
+                # Nothing to rewrite (e.g. named anchors without ``href``)
+                continue
+
+            parsed_href = urlparse(href)
+            if parsed_href.scheme or parsed_href.path.startswith('/'):
+                # don't change external links nor absolute paths
+                continue
+
+            if not parsed_href.path and parsed_href.fragment:
+                # href="#section-link"
+                link.attrib['href'] = document_url.geturl() + href
+                continue
+
+            # internal relative link
+            # href="../../another.html" is appended as-is to the base URL.
+            # The resulting absolute link should be
+            # https://slug.readthedocs.io/en/latest/deep/internal/section/../../another.html
+            link.attrib['href'] = directory_url.geturl() + href
+
+    if html_raw_response:
+        return obj.outerHtml()
+
+    return obj
diff --git a/readthedocs/embed/v3/__init__.py b/readthedocs/embed/v3/__init__.py
diff --git a/readthedocs/embed/v3/urls.py b/readthedocs/embed/v3/urls.py
@@ -0,0 +1,8 @@
+from django.conf.urls import url
+
+from .views import EmbedAPI
+
+
+# Single route for the app: the empty regex matches any path left after the
+# prefix this URLconf is included under, so every request reaching this
+# module is dispatched to ``EmbedAPI``.
+# NOTE(review): the mount point is not visible here -- confirm in the parent URLconf.
+urlpatterns = [
+    url(r'', EmbedAPI.as_view(), name='embed_api_v3'),
+]
diff --git a/readthedocs/embed/v3/views.py b/readthedocs/embed/v3/views.py
@@ -0,0 +1,318 @@
+"""Views for the EmbedAPI v3 app."""
+
+import logging
+import re
+from urllib.parse import urlparse
+import requests
+
+from selectolax.parser import HTMLParser
+from pyquery import PyQuery as PQ  # noqa
+
+from django.conf import settings
+from django.core.cache import cache
+from django.shortcuts import get_object_or_404
+from django.utils.functional import cached_property
+from rest_framework import status
+from rest_framework.permissions import AllowAny
+from rest_framework.renderers import BrowsableAPIRenderer, JSONRenderer
+from rest_framework.response import Response
+from rest_framework.views import APIView
+
+from readthedocs.api.v2.mixins import CachedResponseMixin
+from readthedocs.core.unresolver import unresolve
+from readthedocs.core.utils.extend import SettingsOverrideObject
+from readthedocs.embed.utils import clean_links
+from readthedocs.projects.constants import PUBLIC
+from readthedocs.storage import build_media_storage
+
+log = logging.getLogger(__name__)
+
+
+
+class EmbedAPIBase(CachedResponseMixin, APIView):
+
+    # pylint: disable=line-too-long
+
+    """
+    Embed a section of content from any Read the Docs page.
+
+    ### Arguments
+
+    * url (with fragment) (required)
+    * doctool
+    * doctoolversion
+
+    ### Example
+
+    GET https://readthedocs.org/api/v3/embed/?url=https://docs.readthedocs.io/en/latest/features.html%23full-text-search
+
+    """  # noqa
+
+    permission_classes = [AllowAny]
+    renderer_classes = [JSONRenderer, BrowsableAPIRenderer]
+
+    @cached_property
+    def unresolved_url(self):
+        """
+        Result of ``unresolve`` for the ``url`` GET argument.
+
+        ``None`` is returned without calling ``unresolve`` when the argument
+        is missing or empty. The value is computed once per view instance.
+        """
+        requested_url = self.request.GET.get('url')
+        return unresolve(requested_url) if requested_url else None
+
+    def _download_page_content(self, url):
+        """
+        Return the body of the page at ``url``, using the cache when possible.
+
+        A successful (``response.ok``) download is stored in the cache under
+        ``embed-api-<url>`` for ``RTD_EMBED_API_PAGE_CACHE_TIMEOUT``.
+
+        :returns: the page text, or ``None`` when the request fails or the
+            response is not OK
+        """
+        cache_key = f'embed-api-{url}'
+        page = cache.get(cache_key)
+        if page:
+            log.debug('Cached response. url=%s', url)
+            return page
+
+        try:
+            response = requests.get(url, timeout=1)
+        except requests.exceptions.TooManyRedirects:
+            log.warning('Too many redirects. url=%s', url)
+            return None
+        except Exception:  # noqa
+            log.warning('There was an error reading the URL requested. url=%s', url)
+            return None
+
+        if not response.ok:
+            return None
+
+        page = response.text
+        cache.set(
+            cache_key,
+            page,
+            timeout=settings.RTD_EMBED_API_PAGE_CACHE_TIMEOUT,
+        )
+        return page
+
+    def _get_page_content_from_storage(self):
+        """
+        Read the requested page from ``build_media_storage``.
+
+        The version is looked up with ``get_object_or_404`` among the
+        project's versions, restricted to ``PUBLIC`` ones.
+
+        :returns: the content of the file, or ``None`` when it cannot be read
+        """
+        unresolved = self.unresolved_url
+        project = unresolved.project
+
+        # Only allow PUBLIC versions when getting the content from our
+        # storage for privacy/security reasons
+        version = get_object_or_404(
+            project.versions,
+            slug=unresolved.version_slug,
+            privacy_level=PUBLIC,
+        )
+
+        html_storage_path = project.get_storage_path(
+            'html',
+            version_slug=version.slug,
+            include_file=False,
+            version_type=version.type,
+        )
+        file_path = build_media_storage.join(html_storage_path, unresolved.filename)
+
+        try:
+            with build_media_storage.open(file_path) as fd:
+                content = fd.read()
+        except Exception:  # noqa
+            log.warning('Unable to read file. file_path=%s', file_path)
+            return None
+
+        return content
+
+    def _get_content_by_fragment(self, url, fragment, external, doctool, doctoolversion):
+        """
+        Fetch the page and return the HTML of the requested section.
+
+        External pages are downloaded (the fragment is removed from the URL
+        before requesting it); other pages are read from storage. The page is
+        then handed to ``_parse_based_on_doctool``.
+        """
+        if not external:
+            page_content = self._get_page_content_from_storage()
+        else:
+            parsed = urlparse(url)
+            page_url = parsed._replace(fragment='').geturl()
+            page_content = self._download_page_content(page_url)
+
+        return self._parse_based_on_doctool(page_content, fragment, doctool, doctoolversion)
+
+    def _find_main_node(self, html):
+        """
+        Find the node holding the main content of the page.
+
+        Try the ``[role=main]`` selector first, then ``main`` and finally the
+        parent of the first ``h1`` found inside the body.
+
+        :param html: parsed document exposing ``css_first`` and ``body``
+            (``selectolax.parser.HTMLParser`` in this module)
+        :returns: the node found, or ``None`` when nothing matches
+        """
+        for selector in ('[role=main]', 'main'):
+            main_node = html.css_first(selector)
+            if main_node:
+                log.info('Main node found. selector=%s', selector)
+                return main_node
+
+        # ``body`` may be ``None`` (selectolax documents it as optional):
+        # check it before searching for the header
+        body = html.body
+        first_header = body.css_first('h1') if body else None
+        if first_header:
+            log.info('Main node found. selector=h1')
+            return first_header.parent
+
+        return None
+
+    def _parse_based_on_doctool(self, page_content, fragment, doctool, doctoolversion):
+        """
+        Extract the HTML of the requested section from ``page_content``.
+
+        With a ``fragment`` the element with that ``id`` is selected;
+        without it, the main node of the page is used
+        (see ``_find_main_node``). For Sphinx, ``dt`` elements are special
+        cased to return the surrounding definition list.
+
+        :param page_content: HTML of the whole page. May be ``None``/empty
+            when the page could not be fetched.
+        :param fragment: ``id`` of the element to return (may be empty)
+        :param doctool: documentation tool that generated the page
+        :param doctoolversion: version of ``doctool`` (currently unused)
+        :returns: HTML of the selected node, or ``None`` if it is not found
+        """
+        if not page_content:
+            # Download/read failed upstream: there is nothing to parse
+            return None
+
+        html = HTMLParser(page_content)
+        if fragment:
+            # Use the ``[id=]`` selector instead of ``#id``: the HTML ``id``
+            # attribute accepts characters (``.``, ``:``) that would need to
+            # be escaped in a ``#id`` selector
+            # (e.g. ``id="module-package.module"`` generated by Sphinx)
+            selector = f'[id="{fragment}"]'
+            node = html.css_first(selector)
+        else:
+            node = self._find_main_node(html)
+
+        if not node:
+            return None
+
+        # Handle ``dt`` special cases
+        if doctool == 'sphinx' and node.tag == 'dt':
+            # The parent may have no ``class`` attribute at all
+            parent_classes = node.parent.attributes.get('class') or ''
+
+            if 'glossary' in parent_classes or 'citation' in parent_classes:
+                # Sphinx HTML structure for term glossary and for
+                # sphinxcontrib-bibtex puts the ``id`` in the ``dt`` element
+                # with the title of the term/cite. In this case, we return
+                # the parent node which contains the definition list and
+                # remove all ``dt/dd`` that are not the requested one
+
+                # Structure (glossary):
+                # <dl class="glossary docutils">
+                # <dt id="term-definition">definition</dt>
+                # <dd>Text definition for the term</dd>
+                # ...
+                # </dl>
+
+                # Structure (citation):
+                # <dl class="citation">
+                # <dt id="cite-id"><span><a>Title of the cite</a></span></dt>
+                # <dd>Content of the cite</dd>
+                # ...
+                # </dl>
+                parent = node.parent
+                for n in parent.traverse():
+                    if n not in (node, node.next):
+                        n.remove()
+
+            # Sphinx HTML structure for definition list puts the ``id`` in
+            # the ``dt`` element, instead of the ``dl``. This makes
+            # the backend to return just the title of the definition. In
+            # every ``dt`` case we return the parent with the whole ``dl`` tag
+
+            # Structure:
+            # <dl class="confval">
+            # <dt id="confval-config">
+            # <code class="descname">config</code>
+            # <a class="headerlink" href="#confval-config">¶</a></dt>
+            # <dd><p>Text with a description</p></dd>
+            # </dl>
+            node = node.parent
+
+        return node.html
+
+    def get(self, request):
+        """
+        Return the content of the section pointed by the ``url`` argument.
+
+        Error responses:
+
+        * 400 when ``url`` is missing, when only one of ``doctool`` and
+          ``doctoolversion`` is given, or when the external domain is not
+          listed in ``RTD_EMBED_API_EXTERNAL_DOMAINS``
+        * 429 when the external domain exceeded
+          ``RTD_EMBED_API_DOMAIN_RATE_LIMIT``
+        * 404 when the content cannot be found
+
+        Otherwise the response contains ``url``, ``fragment``, ``content``
+        (with links made absolute) and ``external``; plus ``project``,
+        ``version``, ``language`` and ``path`` when the URL is not external.
+        """
+        url = request.GET.get('url')
+        doctool = request.GET.get('doctool')
+        doctoolversion = request.GET.get('doctoolversion')
+        if not url:
+            return Response(
+                {
+                    'error': (
+                        'Invalid arguments. '
+                        'Please provide "url".'
+                    )
+                },
+                status=status.HTTP_400_BAD_REQUEST
+            )
+
+        if not all([doctool, doctoolversion]) and any([doctool, doctoolversion]):
+            return Response(
+                {
+                    'error': (
+                        'Invalid arguments. '
+                        'Please provide "doctool" and "doctoolversion" or none of them.'
+                    )
+                },
+                status=status.HTTP_400_BAD_REQUEST
+            )
+
+        # NOTE: ``readthedocs.core.unresolver.unresolve`` returns ``None`` when
+        # it cannot find the project in our database
+        external = self.unresolved_url is None
+
+        parsed_url = urlparse(url)
+        external_domain = parsed_url.netloc
+        if external and external_domain:
+            # NOTE(review): ``re.match`` only anchors at the beginning, so a
+            # pattern without a trailing ``$`` also accepts longer host names
+            # (e.g. ``docs.python.org.example.com``). Confirm the patterns in
+            # ``RTD_EMBED_API_EXTERNAL_DOMAINS`` are anchored.
+            allowed_domain = any(
+                re.match(domain, external_domain)
+                for domain in settings.RTD_EMBED_API_EXTERNAL_DOMAINS
+            )
+
+            if not allowed_domain:
+                log.info('Domain not allowed. domain=%s url=%s', external_domain, url)
+                return Response(
+                    {
+                        'error': (
+                            'External domain not allowed. '
+                            f'domain={external_domain}'
+                        )
+                    },
+                    status=status.HTTP_400_BAD_REQUEST,
+                )
+
+            # Check rate-limit for this particular domain
+            cache_key = f'embed-api-{external_domain}'
+            rate_limit_timeout = settings.RTD_EMBED_API_DOMAIN_RATE_LIMIT_TIMEOUT
+            cache.get_or_set(cache_key, 0, timeout=rate_limit_timeout)
+            try:
+                # Use the value returned by ``incr`` instead of reading the
+                # key again: the key may expire between both operations
+                requests_count = cache.incr(cache_key)
+            except ValueError:
+                # The key expired right after ``get_or_set``: this request
+                # is the first one of a new window
+                cache.set(cache_key, 1, timeout=rate_limit_timeout)
+                requests_count = 1
+
+            if requests_count > settings.RTD_EMBED_API_DOMAIN_RATE_LIMIT:
+                log.warning('Too many requests for this domain. domain=%s', external_domain)
+                return Response(
+                    {
+                        'error': (
+                            'Too many requests for this domain. '
+                            f'domain={external_domain}'
+                        )
+                    },
+                    status=status.HTTP_429_TOO_MANY_REQUESTS,
+                )
+
+        # NOTE: we could validate the fragment if we want. It must contain at
+        # least one character, cannot start with a number, and must not contain
+        # whitespaces (spaces, tabs, etc.).
+        fragment = parsed_url.fragment
+
+        content_requested = self._get_content_by_fragment(
+            url,
+            fragment,
+            external,
+            doctool,
+            doctoolversion,
+        )
+        if not content_requested:
+            log.warning('Identifier not found. url=%s', url)
+            return Response(
+                {
+                    'error': (
+                        "Can't find content for section: "
+                        f"url={url} fragment={fragment}"
+                    )
+                },
+                status=status.HTTP_404_NOT_FOUND
+            )
+
+        response = {
+            'url': url,
+            'fragment': fragment,
+            'content': clean_links(
+                content_requested,
+                url,
+                html_raw_response=True,
+            ),
+            'external': external,
+        }
+
+        if not external:
+            response.update({
+                'project': self.unresolved_url.project.slug,
+                'version': self.unresolved_url.version_slug,
+                'language': self.unresolved_url.language_slug,
+                'path': self.unresolved_url.filename,
+            })
+        return Response(response)
+
+
+class EmbedAPI(SettingsOverrideObject):
+    # Public entry point of the view: ``urls.py`` routes to this class.
+    # ``EmbedAPIBase`` is registered as the default implementation.
+    # NOTE(review): presumably ``SettingsOverrideObject`` lets a setting swap
+    # in another class -- confirm in ``readthedocs.core.utils.extend``.
+    _default_class = EmbedAPIBase