-
Notifications
You must be signed in to change notification settings - Fork 178
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(opentrons-ai-server): convert rst files to markdown file (#15703)
re AUTH-541 <!-- Thanks for taking the time to open a pull request! Please make sure you've read the "Opening Pull Requests" section of our Contributing Guide: https://github.com/Opentrons/opentrons/blob/edge/CONTRIBUTING.md#opening-pull-requests To ensure your code is reviewed quickly and thoroughly, please fill out the sections below to the best of your ability! --> # Overview <!-- Use this section to describe your pull-request at a high level. If the PR addresses any open issues, please tag the issues here. --> Converting an HTML file, which was converted from Python API documents (RST format files), to a markdown file. # Test Plan <!-- Use this section to describe the steps that you took to test your Pull Request. If you did not perform any testing provide justification why. OT-3 Developers: You should default to testing on actual physical hardware. Once again, if you did not perform testing against hardware, justify why. Note: It can be helpful to write a test plan before doing development Example Test Plan (HTTP API Change) - Verified that new optional argument `dance-party` causes the robot to flash its lights, move the pipettes, then home. - Verified that when you omit the `dance-party` option the robot homes normally - Added protocol that uses `dance-party` argument to G-Code Testing Suite - Ran protocol that did not use `dance-party` argument and everything was successful - Added unit tests to validate that changes to pydantic model are correct --> - Ensures a new build/docs/html/v2 folder containing an index.html file is created under the utils folder - Ensures the final markdown file has a version-aware filename and is created in the api/data folder - Ensure that the API version reference section has been removed from the main markdown file and written to a markdown file created in the api/data folder # Changelog <!-- List out the changes to the code in this PR. Please try your best to categorize your changes and describe what has changed and why. 
Example changelog: - Fixed app crash when trying to calibrate an illegal pipette - Added state to API to track pipette usage - Updated API docs to mention only two pipettes are supported IMPORTANT: MAKE SURE ANY BREAKING CHANGES ARE PROPERLY COMMUNICATED --> - add python script under opentrons-ai-server/api/utils folder to handle markdown conversion - create data folder to store output markdown files - add BeautifulSoup and markdownify to pipfile # Review requests <!-- Describe any requests for your reviewers here. --> # Risk assessment <!-- Carefully go over your pull request and look at the other parts of the codebase it may affect. Look for the possibility, even if you think it's small, that your change may affect some other part of the system - for instance, changing return tip behavior in protocol may also change the behavior of labware calibration. Identify the other parts of the system your codebase may affect, so that in addition to your own review and testing, other people who may not have the system internalized as much as you can focus their attention and testing there. --> --------- Co-authored-by: shiyaochen <[email protected]> Co-authored-by: shiyaochen <[email protected]>
- Loading branch information
1 parent
309b356
commit eca20ab
Showing
6 changed files
with
10,081 additions
and
173 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
6,212 changes: 6,212 additions & 0 deletions
6,212
opentrons-ai-server/api/data/python_api_219_docs.md
Large diffs are not rendered by default.
Oops, something went wrong.
3,313 changes: 3,313 additions & 0 deletions
3,313
opentrons-ai-server/api/data/python_api_219_reference.md
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,214 @@ | ||
import os.path | ||
import subprocess | ||
import uuid | ||
|
||
from bs4 import BeautifulSoup | ||
from bs4.element import Tag | ||
from markdownify import markdownify # type: ignore | ||
|
||
|
||
def run_sphinx_build(command: str) -> None:
    """Execute a Sphinx build command line through the shell.

    The command is expected to render the rst sources into a single HTML file.
    A non-zero exit status is reported on stdout and otherwise ignored, so
    ``CalledProcessError`` never propagates to the caller.
    """
    try:
        subprocess.run(command, shell=True, check=True)
    except subprocess.CalledProcessError as build_error:
        print(f"An error occurred while running Sphinx build: {build_error}")
|
||
|
||
def remove_specific_logos(soup: BeautifulSoup) -> BeautifulSoup:
    """Drop every <img> whose ``src`` contains "opentrons-images/website".

    The soup is modified in place and returned for chaining.
    """

    def is_website_logo(src: str | None) -> bool:
        # Images without a src attribute (None) or with an empty one never match.
        return src is not None and "opentrons-images/website" in src

    for logo_img in soup.find_all("img", src=is_website_logo):
        logo_img.decompose()
    return soup
|
||
|
||
def remove_all_images(soup: BeautifulSoup) -> BeautifulSoup:
    """Strip every <img> element out of the parsed document, in place."""
    for image_tag in soup.find_all("img"):
        image_tag.decompose()
    return soup
|
||
|
||
def remove_pilcrow_symbols(soup: BeautifulSoup) -> BeautifulSoup:
    """Delete the <a> elements whose text is exactly the pilcrow sign, in place."""
    for pilcrow_link in soup.find_all("a", string="¶"):
        pilcrow_link.decompose()
    return soup
|
||
|
||
def remove_list_items_containing_ot1(soup: BeautifulSoup) -> BeautifulSoup:
    """Delete every <li> whose text content mentions 'OT-1', in place."""
    for item in soup.find_all("li"):
        # Text is checked one item at a time, right before a possible removal.
        if "OT-1" not in item.get_text():
            continue
        item.decompose()
    return soup
|
||
|
||
def remove_top_section(soup: BeautifulSoup) -> BeautifulSoup:
    """Discard the markup that precedes the ``<div class="document">`` element.

    When that div is missing, a message is printed and the soup is returned
    untouched. Otherwise the <head> tag is destroyed, every sibling that comes
    before the div is detached, and ancestors below <body> that have no
    siblings of their own are detached as well.
    """
    document_div = soup.find("div", class_="document")
    if document_div is None:
        print("Start section not found in the HTML content.")
        return soup

    head = soup.find("head")
    if isinstance(head, Tag):
        head.decompose()

    # Snapshot the siblings first: extracting while walking the live generator
    # would cut the traversal short.
    for earlier_node in tuple(document_div.previous_siblings):
        earlier_node.extract()

    # Walk outwards from the div and stop at <body>.
    # NOTE(review): extracting an ancestor also detaches the subtree holding
    # the document div itself - confirm this pruning is what is intended.
    for ancestor in tuple(document_div.parents):
        if ancestor.name == "body":
            break
        has_siblings = ancestor.find_previous_siblings() or ancestor.find_next_siblings()
        if not has_siblings:
            ancestor.extract()

    return soup
|
||
|
||
def remove_footer_content(soup: BeautifulSoup) -> BeautifulSoup:
    """Destroy the first <footer> element, if the document has one."""
    footer = soup.find("footer")
    if not isinstance(footer, Tag):
        return soup
    footer.decompose()
    return soup
|
||
|
||
def clean_html(soup: BeautifulSoup) -> BeautifulSoup:
    """Run every clean-up pass over the document and return the result.

    The passes are applied in this order: logos, all images, pilcrow links,
    list items mentioning OT-1, the top section, and finally the footer.
    """
    cleanup_steps = (
        remove_specific_logos,
        remove_all_images,
        remove_pilcrow_symbols,
        remove_list_items_containing_ot1,
        remove_top_section,
        remove_footer_content,
    )
    for step in cleanup_steps:
        soup = step(soup)
    return soup
|
||
|
||
def extract_and_remove_api_reference(html_file_path: str, output_file_path: str) -> BeautifulSoup:
    """Split the API Version 2 Reference section out of the HTML documentation.

    The HTML file is parsed and cleaned. The ``document-new_protocol_api`` span
    and its following ``api-version-2-reference`` section are then converted to
    Markdown, written to ``output_file_path``, and removed from the soup.

    If either element is missing, a message is printed, nothing is written,
    and the cleaned soup is returned as is.
    """
    with open(html_file_path, "r", encoding="utf-8") as html_file:
        raw_html = html_file.read()

    soup = clean_html(BeautifulSoup(raw_html, "html.parser"))

    # The span is the anchor that immediately precedes the reference section.
    anchor_span = soup.find("span", id="document-new_protocol_api")
    if anchor_span is None:
        print("Start span not found.")
        return soup

    reference_section = anchor_span.find_next_sibling("section", id="api-version-2-reference")
    if reference_section is None:
        print("API section not found.")
        return soup

    # Serialize anchor + section together so the reference file keeps its anchor.
    reference_markdown = markdownify("".join((str(anchor_span), str(reference_section))))
    with open(output_file_path, "w", encoding="utf-8") as reference_file:
        reference_file.write(reference_markdown)

    # Drop both elements so they do not show up in the main Markdown output.
    if isinstance(anchor_span, Tag) and isinstance(reference_section, Tag):
        anchor_span.decompose()
        reference_section.decompose()

    return soup
|
||
|
||
def extract_tab_content(soup: BeautifulSoup) -> tuple[BeautifulSoup, dict[str, str]]:
    """Swap each sphinx-tabs container for a placeholder and render it to Markdown.

    Returns the modified soup together with a mapping from each unique
    placeholder string to the Markdown generated for the tab section it
    replaced. Every tab becomes a ``###`` heading followed by its panel content.
    """
    tab_markdown: dict[str, str] = {}

    for tab_section in soup.find_all(class_="sphinx-tabs docutils container"):
        buttons = tab_section.find_all(class_="sphinx-tabs-tab")
        panels = tab_section.find_all(class_="sphinx-tabs-panel")

        # Buttons and panels are paired positionally; surplus entries are ignored.
        pieces = []
        for button, panel in zip(buttons, panels, strict=False):
            pieces += [
                f"### {button.text.strip()}\n",
                markdownify(str(panel), strip=["div"]),
            ]

        # A random hex token keeps each placeholder unique within the document.
        placeholder = f"tabSectionIs{uuid.uuid4().hex}"
        tab_markdown[placeholder] = "\n".join(pieces) + "\n\n"

        placeholder_tag = soup.new_tag("div")
        placeholder_tag.string = placeholder
        tab_section.replace_with(placeholder_tag)

    return soup, tab_markdown
|
||
|
||
def convert_html_to_markdown(html_file_path: str, markdown_file_path: str, reference_file_path: str) -> None:
    """Render the HTML documentation as Markdown and write it to disk.

    The API reference section is split off into ``reference_file_path`` first.
    Tabbed sections are replaced by placeholders before the bulk conversion and
    substituted back as pre-rendered Markdown afterwards. The final text is
    written to ``markdown_file_path``.
    """
    soup = extract_and_remove_api_reference(html_file_path, reference_file_path)
    soup, tab_markdown = extract_tab_content(soup)

    full_markdown = markdownify(str(soup))

    # str.replace is a no-op for a placeholder that did not survive conversion.
    for placeholder, section_md in tab_markdown.items():
        full_markdown = full_markdown.replace(placeholder, section_md)

    with open(markdown_file_path, "w", encoding="utf-8") as markdown_file:
        markdown_file.write(full_markdown)
|
||
|
||
def get_latest_version() -> str:
    """Get the latest docs version number from the repository's git tags.

    Tags matching ``docs@2*`` are listed newest first (by tagger date). The
    first one is turned into a compact version string: the part after ``@`` is
    taken, dots are dropped, and anything from the first ``_`` on is discarded.
    For example ``docs@2.19_2`` becomes ``219``.

    Returns:
        The compact version string, or an empty string when git cannot be
        started, the git command fails, or no usable tag is found.
    """
    # An argument list (no shell) avoids depending on `head` and on shell quoting.
    command = ["git", "tag", "-l", "docs@2*", "--sort=-taggerdate"]
    try:
        # check=True is what makes a failing git command raise CalledProcessError.
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing git executable.
        print(f"An error occurred while getting the version: {e}")
        return ""

    # Tag names cannot contain whitespace, so the first token is the newest tag.
    tags = result.stdout.split()
    if not tags:
        print("An error occurred while getting the version: no 'docs@2*' tag found")
        return ""

    parts = tags[0].replace(".", "").split("@")
    if len(parts) < 2:
        print(f"An error occurred while getting the version: unexpected tag {tags[0]!r}")
        return ""

    return parts[1].split("_")[0]
|
||
|
||
def get_markdown_format() -> None:
    """Build the HTML docs with Sphinx and convert them to version-aware Markdown.

    Output file names embed the value returned by ``get_latest_version``. The
    build and data directories are resolved relative to this module, while the
    docs source path is relative to the current working directory.
    """
    version = get_latest_version()
    here = os.path.dirname(__file__)

    data_dir = os.path.join(here, "..", "data")
    build_html_path = os.path.join(here, "build", "docs", "html", "v2")
    docs_src_path = os.path.join("..", "api", "docs", "v2")

    run_sphinx_build(f"pipenv run sphinx-build -b singlehtml {docs_src_path} {build_html_path}")

    convert_html_to_markdown(
        os.path.join(build_html_path, "index.html"),
        os.path.join(data_dir, f"python_api_{version}_docs.md"),
        os.path.join(data_dir, f"python_api_{version}_reference.md"),
    )
|
||
|
||
# Script entry point: regenerate the Markdown docs when run directly.
if __name__ == "__main__":
    get_markdown_format()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
from unittest.mock import MagicMock, mock_open, patch | ||
|
||
import pytest | ||
from api.utils.convert_to_markdown import ( | ||
clean_html, | ||
convert_html_to_markdown, | ||
extract_and_remove_api_reference, | ||
extract_tab_content, | ||
get_latest_version, | ||
get_markdown_format, | ||
remove_all_images, | ||
remove_footer_content, | ||
remove_list_items_containing_ot1, | ||
remove_pilcrow_symbols, | ||
remove_specific_logos, | ||
remove_top_section, | ||
run_sphinx_build, | ||
) | ||
from bs4 import BeautifulSoup | ||
|
||
# Minimal document shared by the tests below. It contains one instance of every
# element the clean-up and extraction helpers look for.
sample_html = """
<html>
<head></head>
<body>
<div class="document"></div>
<footer></footer>
<img src="opentrons-images/website/logo.png">
<a>¶</a>
<li>OT-1</li>
<span id="document-new_protocol_api"></span>
<section id="api-version-2-reference"></section>
<div class="sphinx-tabs docutils container">
<div class="sphinx-tabs-tab">Tab 1</div>
<div class="sphinx-tabs-panel">Content 1</div>
<div class="sphinx-tabs-tab">Tab 2</div>
<div class="sphinx-tabs-panel">Content 2</div>
</div>
</body>
</html>
"""
|
||
|
||
@pytest.fixture
def soup() -> BeautifulSoup:
    """Provide a freshly parsed copy of the sample document for each test."""
    parsed_sample = BeautifulSoup(sample_html, "html.parser")
    return parsed_sample
|
||
|
||
@pytest.mark.unit
def test_run_sphinx_build() -> None:
    """The command string is handed to subprocess.run unchanged, via the shell."""
    command = "echo test"
    with patch("subprocess.run") as mock_run:
        run_sphinx_build(command)
    mock_run.assert_called_once_with(command, check=True, shell=True)
|
||
|
||
@pytest.mark.unit
def test_remove_specific_logos(soup: BeautifulSoup) -> None:
    """The website logo image is gone after the logo pass."""
    cleaned = remove_specific_logos(soup)
    remaining_logos = cleaned.find_all("img", src="opentrons-images/website/logo.png")
    assert len(remaining_logos) == 0
|
||
|
||
@pytest.mark.unit
def test_remove_all_images(soup: BeautifulSoup) -> None:
    """No <img> element survives the image pass."""
    cleaned = remove_all_images(soup)
    assert len(cleaned.find_all("img")) == 0
|
||
|
||
@pytest.mark.unit
def test_remove_pilcrow_symbols(soup: BeautifulSoup) -> None:
    """Pilcrow anchor links are removed."""
    cleaned = remove_pilcrow_symbols(soup)
    assert len(cleaned.find_all("a", string="¶")) == 0
|
||
|
||
@pytest.mark.unit
def test_remove_list_items_containing_ot1(soup: BeautifulSoup) -> None:
    """List items mentioning OT-1 are removed."""
    cleaned = remove_list_items_containing_ot1(soup)
    assert len(cleaned.find_all("li", string="OT-1")) == 0
|
||
|
||
@pytest.mark.unit
def test_remove_top_section(soup: BeautifulSoup) -> None:
    """The <head> element is removed along with the top section."""
    cleaned = remove_top_section(soup)
    assert cleaned.find("head") is None
|
||
|
||
@pytest.mark.unit
def test_remove_footer_content(soup: BeautifulSoup) -> None:
    """The <footer> element is removed."""
    cleaned = remove_footer_content(soup)
    assert cleaned.find("footer") is None
|
||
|
||
@pytest.mark.unit
def test_clean_html(soup: BeautifulSoup) -> None:
    """The combined clean-up applies every individual removal pass."""
    cleaned = clean_html(soup)

    # Elements located with find_all must all be gone.
    assert len(cleaned.find_all("img", src="opentrons-images/website/logo.png")) == 0
    assert len(cleaned.find_all("img")) == 0
    assert len(cleaned.find_all("a", string="¶")) == 0
    assert len(cleaned.find_all("li", string="OT-1")) == 0

    # Single structural elements must no longer be found.
    assert cleaned.find("head") is None
    assert cleaned.find("footer") is None
|
||
|
||
@pytest.mark.unit
@patch("builtins.open", new_callable=mock_open, read_data=sample_html)
def test_extract_and_remove_api_reference(mock_file: MagicMock, soup: BeautifulSoup) -> None:
    """The anchor span and the reference section are stripped from the returned soup."""
    # File access is mocked: reads yield sample_html, writes go nowhere.
    remaining = extract_and_remove_api_reference("index.html", "output.md")
    assert remaining.find("span", id="document-new_protocol_api") is None
    assert remaining.find("section", id="api-version-2-reference") is None
|
||
|
||
@pytest.mark.unit
def test_extract_tab_content(soup: BeautifulSoup) -> None:
    """The single tab container in the sample produces exactly one placeholder entry."""
    _, tab_markdown = extract_tab_content(soup)
    assert len(tab_markdown) == 1
|
||
|
||
@pytest.mark.unit
@patch("builtins.open", new_callable=mock_open)
def test_convert_html_to_markdown(mock_file: MagicMock, soup: BeautifulSoup) -> None:
    """The conversion goes through the (mocked) file API at least once."""
    convert_html_to_markdown("index.html", "output.md", "reference.md")
    mock_file.assert_called()
|
||
|
||
@pytest.mark.unit
@patch("subprocess.run")
def test_get_latest_version(mock_run: MagicMock) -> None:
    """A ``docs@<major>.<minor>_<n>`` tag is reduced to its digits before the underscore."""
    # The tag needs a literal "@": the parser takes the text after it, drops
    # the dots, and cuts at the first underscore.
    mock_run.return_value.stdout = "docs@2.19_2\n"
    version = get_latest_version()
    assert version == "219"
|
||
|
||
@pytest.mark.unit
@patch("api.utils.convert_to_markdown.get_latest_version")
@patch("api.utils.convert_to_markdown.run_sphinx_build")
@patch("api.utils.convert_to_markdown.convert_html_to_markdown")
def test_get_markdown_format(mock_convert: MagicMock, mock_build: MagicMock, mock_version: MagicMock) -> None:
    """The top-level workflow runs both the Sphinx build and the conversion step."""
    # Decorators apply bottom-up, so the mocks arrive as convert, build, version.
    mock_version.return_value = "200"

    get_markdown_format()

    for collaborator in (mock_build, mock_convert):
        collaborator.assert_called()