Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(opentrons-ai-server): convert rst files to markdown file #15703

Merged
merged 16 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions opentrons-ai-server/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ ddtrace = "==2.9.2"
pydantic-settings = "==2.3.4"
pyjwt = {extras = ["crypto"], version = "*"}
python-json-logger = "==2.0.7"
beautifulsoup4 = "==4.12.3"
markdownify = "==0.13.1"

[dev-packages]
docker = "==7.1.0"
Expand All @@ -27,6 +29,7 @@ boto3-stubs = "==1.34.114"
rich = "==13.7.1"
cryptography = "==42.0.7"
types-docker = "==7.0.0.20240528"
types-beautifulsoup4 = "*"

[requires]
python_version = "3.12"
Expand Down
367 changes: 194 additions & 173 deletions opentrons-ai-server/Pipfile.lock

Large diffs are not rendered by default.

6,212 changes: 6,212 additions & 0 deletions opentrons-ai-server/api/data/python_api_219_docs.md

Large diffs are not rendered by default.

3,313 changes: 3,313 additions & 0 deletions opentrons-ai-server/api/data/python_api_219_reference.md

Large diffs are not rendered by default.

214 changes: 214 additions & 0 deletions opentrons-ai-server/api/utils/convert_to_markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
import os.path
import subprocess
import uuid

from bs4 import BeautifulSoup
from bs4.element import Tag
from markdownify import markdownify # type: ignore


def run_sphinx_build(command: str) -> None:
    """Execute the Sphinx shell command that renders the rst sources to one HTML page.

    Args:
        command: Complete shell command line; it is run through the shell.

    A non-zero exit status is reported on stdout and otherwise ignored, so
    this function never propagates ``CalledProcessError`` to its caller.
    """
    try:
        subprocess.run(command, check=True, shell=True)
    except subprocess.CalledProcessError as build_error:
        print(f"An error occurred while running Sphinx build: {build_error}")
    return None


def remove_specific_logos(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip Opentrons website logo images from the parsed HTML.

    An ``<img>`` is treated as a logo when its ``src`` attribute contains
    ``opentrons-images/website``. The soup is modified in place and returned.
    """

    def is_website_logo(src: str | None) -> bool:
        # Images without a src attribute are passed in as None and never match.
        return bool(src and ("opentrons-images/website" in src))

    for logo_img in soup.find_all("img", src=is_website_logo):
        logo_img.decompose()
    return soup


def remove_all_images(soup: BeautifulSoup) -> BeautifulSoup:
    """Drop every ``<img>`` element from the soup (in place) and return the soup."""
    for image_tag in soup.find_all("img"):
        image_tag.decompose()
    return soup


def remove_pilcrow_symbols(soup: BeautifulSoup) -> BeautifulSoup:
    """Drop every ``<a>`` element whose string is exactly the pilcrow sign.

    The soup is modified in place and returned.
    """
    for pilcrow_link in soup.find_all("a", string="¶"):
        pilcrow_link.decompose()
    return soup


def remove_list_items_containing_ot1(soup: BeautifulSoup) -> BeautifulSoup:
    """Drop every ``<li>`` element whose text mentions ``OT-1``.

    The soup is modified in place and returned.
    """
    for item in soup.find_all("li"):
        # The text check and the removal stay interleaved, item by item.
        if "OT-1" not in item.get_text():
            continue
        item.decompose()
    return soup


def remove_top_section(soup: BeautifulSoup) -> BeautifulSoup:
    """Trim the page content that precedes the ``<div class="document">`` element.

    When that div is missing, a message is printed and the soup is returned
    untouched. Otherwise the ``<head>`` tag and all earlier siblings of the
    div are removed, and each ancestor below ``<body>`` that has no sibling
    tags is detached. The soup is modified in place and returned.
    """
    document_div = soup.find("div", class_="document")
    if not document_div:
        print("Start section not found in the HTML content.")
        return soup

    # The <head> element is dropped entirely.
    head = soup.find("head")
    if isinstance(head, Tag):
        head.decompose()

    # Snapshot the siblings first: extracting while iterating the live
    # generator would cut the walk short.
    for earlier_sibling in list(document_div.previous_siblings):
        earlier_sibling.extract()

    # Walk outwards from the div and stop once <body> is reached.
    for ancestor in list(document_div.parents):
        if ancestor.name == "body":
            break
        has_sibling_tags = ancestor.find_previous_siblings() or ancestor.find_next_siblings()
        if not has_sibling_tags:
            ancestor.extract()

    return soup


def remove_footer_content(soup: BeautifulSoup) -> BeautifulSoup:
    """Drop the first ``<footer>`` element, if any, and return the soup."""
    footer = soup.find("footer")
    if not isinstance(footer, Tag):
        # Nothing to remove when no footer tag was found.
        return soup
    footer.decompose()
    return soup


def clean_html(soup: BeautifulSoup) -> BeautifulSoup:
    """Run every HTML clean-up step over the soup, in a fixed order.

    Order: logos, all images, pilcrow anchors, OT-1 list items, the top
    section, then the footer. Returns the soup produced by the last step.
    """
    cleanup_steps = (
        remove_specific_logos,
        remove_all_images,
        remove_pilcrow_symbols,
        remove_list_items_containing_ot1,
        remove_top_section,
        remove_footer_content,
    )
    for step in cleanup_steps:
        soup = step(soup)
    return soup


def extract_and_remove_api_reference(html_file_path: str, output_file_path: str) -> BeautifulSoup:
    """Split the API Version 2 Reference out of the HTML into its own Markdown file.

    Args:
        html_file_path: Path of the HTML file to read (UTF-8).
        output_file_path: Path of the Markdown file that receives the reference section.

    Returns:
        The cleaned soup. When both the ``document-new_protocol_api`` span and
        the ``api-version-2-reference`` section are found they are written to
        ``output_file_path`` as Markdown and removed from the soup. If either
        is missing, a message is printed and the cleaned soup is returned
        without writing anything.
    """
    with open(html_file_path, "r", encoding="utf-8") as html_file:
        raw_html = html_file.read()

    soup = clean_html(BeautifulSoup(raw_html, "html.parser"))

    # The reference starts at this anchor span...
    anchor_span = soup.find("span", id="document-new_protocol_api")
    if anchor_span is None:
        print("Start span not found.")
        return soup

    # ...followed by the section that holds the reference body.
    reference_section = anchor_span.find_next_sibling("section", id="api-version-2-reference")
    if reference_section is None:
        print("API section not found.")
        return soup

    reference_markdown = markdownify("".join(map(str, (anchor_span, reference_section))))

    with open(output_file_path, "w", encoding="utf-8") as reference_file:
        reference_file.write(reference_markdown)

    # Remove both pieces so the main document does not repeat the reference.
    if isinstance(anchor_span, Tag) and isinstance(reference_section, Tag):
        for element in (anchor_span, reference_section):
            element.decompose()

    return soup


def extract_tab_content(soup: BeautifulSoup) -> tuple[BeautifulSoup, dict[str, str]]:
    """Swap each sphinx-tabs container for a placeholder and render its tabs to Markdown.

    Returns:
        A pair ``(soup, mapping)``. Every tab container in ``soup`` has been
        replaced by a ``<div>`` whose text is a unique placeholder; ``mapping``
        maps each placeholder to the Markdown for that container (one ``###``
        heading per tab button, followed by the matching panel).
    """
    markdown_by_placeholder: dict[str, str] = {}

    for tab_group in soup.find_all(class_="sphinx-tabs docutils container"):
        buttons = tab_group.find_all(class_="sphinx-tabs-tab")
        panels = tab_group.find_all(class_="sphinx-tabs-panel")

        pieces: list[str] = []
        # Buttons and panels are paired by position; surplus entries are ignored.
        for button, panel in zip(buttons, panels, strict=False):
            heading = f"### {button.text.strip()}\n"
            body = markdownify(str(panel), strip=["div"])
            pieces.extend((heading, body))
        group_markdown = "\n".join(pieces) + "\n\n"

        # A random hex token keeps every placeholder unique within the document.
        token = f"tabSectionIs{uuid.uuid4().hex}"
        markdown_by_placeholder[token] = group_markdown

        stand_in = soup.new_tag("div")
        stand_in.string = token
        tab_group.replace_with(stand_in)

    return soup, markdown_by_placeholder


def convert_html_to_markdown(html_file_path: str, markdown_file_path: str, reference_file_path: str) -> None:
    """Turn the HTML docs into a main Markdown file plus a separate API reference file.

    Args:
        html_file_path: HTML file to convert.
        markdown_file_path: Destination of the main Markdown document.
        reference_file_path: Destination of the extracted API reference Markdown.
    """
    soup = extract_and_remove_api_reference(html_file_path, reference_file_path)
    soup, tab_markdown = extract_tab_content(soup)

    full_markdown = markdownify(str(soup))

    # Put the pre-rendered tab sections back where their placeholders landed.
    # str.replace leaves the text unchanged when a placeholder is absent.
    for placeholder, section_md in tab_markdown.items():
        full_markdown = full_markdown.replace(placeholder, section_md)

    with open(markdown_file_path, "w", encoding="utf-8") as markdown_file:
        markdown_file.write(full_markdown)


def get_latest_version() -> str:
    """Get the latest docs version number, with the dots removed.

    The newest git tag matching ``docs@2*`` is looked up; a tag such as
    ``docs@2.19_2`` yields ``"219"``.

    Returns:
        The compact version string, or ``""`` when the git command fails or
        its output contains no usable ``name@version`` tag (for example when
        no matching tag exists).
    """
    # Newest matching tag first; only the first line is kept.
    command = "git tag -l 'docs@2*' --sort=-taggerdate | head -n 1"
    try:
        # check=True is what makes CalledProcessError reachable at all.
        result = subprocess.run(command, capture_output=True, text=True, shell=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"An error occurred while getting the version: {e}")
        return ""

    # Drop the dots, then split "docs@219_2" into its name and version parts.
    tag = result.stdout.strip().replace(".", "")
    tag_parts = tag.split("@")
    if len(tag_parts) < 2:
        # Empty or unexpected output: report it instead of raising IndexError.
        print(f"No docs version tag found in git output: {tag!r}")
        return ""

    # Anything after the first underscore is not part of the version.
    return tag_parts[1].split("_")[0]


def get_markdown_format() -> None:
    """Build the single-page HTML docs with Sphinx and convert them to versioned Markdown.

    Both output files are written to the ``data`` directory next to this
    module's parent and carry the version from ``get_latest_version`` in
    their names.
    """
    version = get_latest_version()
    module_dir = os.path.dirname(__file__)

    data_dir = os.path.join(module_dir, "..", "data")
    build_html_path = os.path.join(module_dir, "build", "docs", "html", "v2")
    # NOTE(review): unlike the other paths, this one is not anchored at module_dir.
    docs_src_path = os.path.join("..", "api", "docs", "v2")

    run_sphinx_build(f"pipenv run sphinx-build -b singlehtml {docs_src_path} {build_html_path}")

    convert_html_to_markdown(
        os.path.join(build_html_path, "index.html"),
        os.path.join(data_dir, f"python_api_{version}_docs.md"),
        os.path.join(data_dir, f"python_api_{version}_reference.md"),
    )


# Script entry point: regenerate the Markdown docs when this module is run directly.
if __name__ == "__main__":
    get_markdown_format()
145 changes: 145 additions & 0 deletions opentrons-ai-server/tests/test_convert_to_markdown.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from unittest.mock import MagicMock, mock_open, patch

import pytest
from api.utils.convert_to_markdown import (
clean_html,
convert_html_to_markdown,
extract_and_remove_api_reference,
extract_tab_content,
get_latest_version,
get_markdown_format,
remove_all_images,
remove_footer_content,
remove_list_items_containing_ot1,
remove_pilcrow_symbols,
remove_specific_logos,
remove_top_section,
run_sphinx_build,
)
from bs4 import BeautifulSoup

# Sample HTML content for testing.
# It holds one instance of each element the cleaning helpers look for: a
# <head>, a "document" div, a <footer>, a website logo <img>, a pilcrow
# anchor, an "OT-1" list item, the API reference span + section, and one
# sphinx-tabs container with two tab/panel pairs.
sample_html = """
<html>
<head></head>
<body>
<div class="document"></div>
<footer></footer>
<img src="opentrons-images/website/logo.png">
<a>¶</a>
<li>OT-1</li>
<span id="document-new_protocol_api"></span>
<section id="api-version-2-reference"></section>
<div class="sphinx-tabs docutils container">
<div class="sphinx-tabs-tab">Tab 1</div>
<div class="sphinx-tabs-panel">Content 1</div>
<div class="sphinx-tabs-tab">Tab 2</div>
<div class="sphinx-tabs-panel">Content 2</div>
</div>
</body>
</html>
"""


@pytest.fixture
def soup() -> BeautifulSoup:
    """Provide a freshly parsed copy of ``sample_html`` for each test."""
    parsed = BeautifulSoup(sample_html, features="html.parser")
    return parsed


@pytest.mark.unit
def test_run_sphinx_build() -> None:
    """The command reaches ``subprocess.run`` unchanged, with ``check`` and ``shell`` enabled."""
    build_command = "echo test"
    with patch("subprocess.run") as fake_run:
        run_sphinx_build(build_command)
    # The patched object keeps its call record after the context exits.
    fake_run.assert_called_once_with(build_command, check=True, shell=True)


@pytest.mark.unit
def test_remove_specific_logos(soup: BeautifulSoup) -> None:
    """The website logo image is gone after logo removal."""
    cleaned = remove_specific_logos(soup)
    remaining_logos = cleaned.find_all("img", src="opentrons-images/website/logo.png")
    assert not remaining_logos


@pytest.mark.unit
def test_remove_all_images(soup: BeautifulSoup) -> None:
    """No ``<img>`` element survives image removal."""
    cleaned = remove_all_images(soup)
    remaining_images = cleaned.find_all("img")
    assert not remaining_images


@pytest.mark.unit
def test_remove_pilcrow_symbols(soup: BeautifulSoup) -> None:
    """No pilcrow anchor survives pilcrow removal."""
    cleaned = remove_pilcrow_symbols(soup)
    remaining_pilcrows = cleaned.find_all("a", string="¶")
    assert not remaining_pilcrows


@pytest.mark.unit
def test_remove_list_items_containing_ot1(soup: BeautifulSoup) -> None:
    """The ``OT-1`` list item is gone after filtering."""
    cleaned = remove_list_items_containing_ot1(soup)
    remaining_items = cleaned.find_all("li", string="OT-1")
    assert not remaining_items


@pytest.mark.unit
def test_remove_top_section(soup: BeautifulSoup) -> None:
    """The ``<head>`` tag is removed along with the top section."""
    cleaned = remove_top_section(soup)
    head_tag = cleaned.find("head")
    assert not head_tag


@pytest.mark.unit
def test_remove_footer_content(soup: BeautifulSoup) -> None:
    """The ``<footer>`` tag is gone after footer removal."""
    cleaned = remove_footer_content(soup)
    footer_tag = cleaned.find("footer")
    assert not footer_tag


@pytest.mark.unit
def test_clean_html(soup: BeautifulSoup) -> None:
    """The full clean-up pipeline removes every element the single steps target."""
    cleaned = clean_html(soup)

    # Elements looked up with find_all.
    assert not cleaned.find_all("img", src="opentrons-images/website/logo.png")
    assert not cleaned.find_all("img")
    assert not cleaned.find_all("a", string="¶")
    assert not cleaned.find_all("li", string="OT-1")

    # Elements looked up with find.
    for tag_name in ("head", "footer"):
        assert not cleaned.find(tag_name)


@pytest.mark.unit
@patch("builtins.open", new_callable=mock_open, read_data=sample_html)
def test_extract_and_remove_api_reference(mock_file: MagicMock, soup: BeautifulSoup) -> None:
    """The API reference span and section are absent from the returned soup."""
    # open() is patched, so neither path touches the real filesystem.
    result = extract_and_remove_api_reference("index.html", "output.md")
    assert not result.find("span", id="document-new_protocol_api")
    assert not result.find("section", id="api-version-2-reference")


@pytest.mark.unit
def test_extract_tab_content(soup: BeautifulSoup) -> None:
    """The single tab container in the sample yields exactly one placeholder entry."""
    _, placeholders = extract_tab_content(soup)
    assert len(placeholders) == 1


@pytest.mark.unit
@patch("builtins.open", new_callable=mock_open)
def test_convert_html_to_markdown(mock_file: MagicMock, soup: BeautifulSoup) -> None:
    """The conversion goes through the patched ``open`` at least once."""
    convert_html_to_markdown(
        html_file_path="index.html",
        markdown_file_path="output.md",
        reference_file_path="reference.md",
    )
    mock_file.assert_called()


@pytest.mark.unit
@patch("subprocess.run")
def test_get_latest_version(mock_run: MagicMock) -> None:
    """A ``docs@2.19_2`` tag on stdout is reduced to the compact version ``219``."""
    # The stubbed stdout must be a real "name@version_suffix" tag; without
    # the "@" there is no version part to extract.
    mock_run.return_value.stdout = "docs@2.19_2\n"
    version = get_latest_version()
    assert version == "219"


@pytest.mark.unit
@patch("api.utils.convert_to_markdown.get_latest_version")
@patch("api.utils.convert_to_markdown.run_sphinx_build")
@patch("api.utils.convert_to_markdown.convert_html_to_markdown")
def test_get_markdown_format(mock_convert: MagicMock, mock_build: MagicMock, mock_version: MagicMock) -> None:
    """The entry point runs both the Sphinx build and the Markdown conversion."""
    mock_version.return_value = "200"

    get_markdown_format()

    # Both collaborators are patched; only the fact that they ran is checked.
    for collaborator in (mock_build, mock_convert):
        collaborator.assert_called()
Loading