Skip to content

Commit

Permalink
NotebookRunner: Run all notebooks in repo directory
Browse files Browse the repository at this point in the history
* NotebookRunner business now runs all notebooks in a repo by default
* Add `disable_run_in_subdirs` option to NotebookRunner business config.
  This is `False` by default, so the default going forward is that all
  notebooks in a repo are run by this business.
  [This phalanx PR] sets this option to `true` in all existing
  configurations so as not to break any existing workflows.
* Add `exclude_dirs` option to NotebookRunner business to list
  directories in which notebooks will NOT be run
  • Loading branch information
fajpunk committed May 19, 2024
1 parent 6b8bf9c commit c870edb
Show file tree
Hide file tree
Showing 8 changed files with 1,430 additions and 8 deletions.
1 change: 1 addition & 0 deletions requirements/dev.in
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# Testing
asgi-lifespan
coverage[toml]
documenteer[guide]
mypy
pre-commit
pytest
Expand Down
858 changes: 855 additions & 3 deletions requirements/dev.txt

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions src/mobu/models/business/notebookrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

from pathlib import Path
from typing import Literal

from pydantic import Field
Expand Down Expand Up @@ -48,6 +49,23 @@ class NotebookRunnerOptions(NubladoBusinessOptions):
description="Only used by the NotebookRunner",
)

disable_run_in_subdirs: bool = Field(
False,
title="Don't run notebooks in subdirectories",
description="Only used by the NotebookRunner",
)

exclude_dirs: set[Path] = Field(
set(),
title="Any notebooks in these directories will not be run",
description=(
" These directories are relative to the repo root. Any notebooks"
" in child directories of these directories will also be excluded."
" Only used by the NotebookRunner.",
),
examples=["some-dir", "some-dir/some-other-dir"],
)


class NotebookRunnerConfig(BusinessConfig):
"""Configuration specialization for NotebookRunner."""
Expand Down
19 changes: 16 additions & 3 deletions src/mobu/services/business/notebookrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __init__(
self._notebook: Path | None = None
self._notebook_paths: list[Path] | None = None
self._repo_dir: Path | None = None
self._exclude_paths: set[Path] | None = None
self._running_code: str | None = None
self._git = Git(logger=logger)

Expand All @@ -72,6 +73,9 @@ async def startup(self) -> None:
if self._repo_dir is None:
self._repo_dir = Path(TemporaryDirectory().name)
await self.clone_repo()
self._exclude_paths = {
(self._repo_dir / path) for path in self.options.exclude_dirs
}
self._notebook_paths = self.find_notebooks()
self.logger.info("Repository cloned and ready")
await super().startup()
Expand All @@ -87,15 +91,24 @@ async def clone_repo(self) -> None:
with self.timings.start("clone_repo"):
await self._git.clone("-b", branch, url, str(self._repo_dir))

def is_excluded(self, notebook: Path) -> bool:
# A notebook is excluded if any of its parent directories are excluded
return set(notebook.parents) & self._exclude_paths

def find_notebooks(self) -> list[Path]:
with self.timings.start("find_notebooks"):
if self._repo_dir is None:
raise NotebookRepositoryError(
"Repository directory must be set", self.user.username
)
notebooks = [
p for p in self._repo_dir.iterdir() if p.suffix == ".ipynb"
]
if self.options.disable_run_in_subdirs:
notebooks = list(self._repo_dir.glob("*.ipynb"))
else:
notebooks = [
n
for n in self._repo_dir.glob("**/*.ipynb")
if not self.is_excluded(n)
]
if not notebooks:
msg = "No notebooks found in {self._repo_dir}"
raise NotebookRepositoryError(msg, self.user.username)
Expand Down
218 changes: 216 additions & 2 deletions tests/business/notebookrunner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,10 @@ async def test_run(
"options": {
"spawn_settle_time": 0,
"execution_idle_time": 0,
"max_executions": 1,
"max_executions": 10,
"repo_url": str(repo_path),
"repo_branch": "main",
"disable_run_in_subdirs": True,
"working_directory": str(repo_path),
},
},
Expand All @@ -76,7 +77,7 @@ async def test_run(
"business": {
"failure_count": 0,
"name": "NotebookRunner",
"notebook": "test-notebook.ipynb",
"notebook": ANY,
"success_count": 1,
"timings": ANY,
},
Expand All @@ -93,9 +94,221 @@ async def test_run(
# Get the log and check the cell output.
r = await client.get("/mobu/flocks/test/monkeys/testuser1/log")
assert r.status_code == 200

# Root notebook
assert "This is a test" in r.text
assert "This is another test" in r.text
assert "Final test" in r.text

# some-dir notebook
assert "Test some-dir" not in r.text

# some-other-dir notebook
assert "Test some-other-dir" not in r.text

# double-nested-dir notebook
assert "Test double-nested-dir" not in r.text

# Exceptions
assert "Exception thrown" not in r.text


@pytest.mark.asyncio
async def test_run_subdirs(
client: AsyncClient, respx_mock: respx.Router, tmp_path: Path
) -> None:
mock_gafaelfawr(respx_mock)
cwd = Path.cwd()

# Set up a notebook repository.
source_path = Path(__file__).parent.parent / "notebooks"
repo_path = tmp_path / "notebooks"

shutil.copytree(str(source_path), str(repo_path))
# Remove exception notebook
(repo_path / "exception.ipynb").unlink()

# Set up git client
git = Git(repo=repo_path)
await git.init("--initial-branch=main")
await git.config("user.email", "[email protected]")
await git.config("user.name", "Git User")
for path in repo_path.iterdir():
if not path.name.startswith("."):
await git.add(str(path))
await git.commit("-m", "Initial commit")

# Start a monkey. We have to do this in a try/finally block since the
# runner will change working directories, which because working
# directories are process-global may mess up future tests.
try:
r = await client.put(
"/mobu/flocks",
json={
"name": "test",
"count": 1,
"user_spec": {"username_prefix": "testuser"},
"scopes": ["exec:notebook"],
"business": {
"type": "NotebookRunner",
"options": {
"spawn_settle_time": 0,
"execution_idle_time": 0,
"max_executions": 10,
"repo_url": str(repo_path),
"repo_branch": "main",
"working_directory": str(repo_path),
},
},
},
)
assert r.status_code == 201

# Wait until we've finished one loop and check the results.
data = await wait_for_business(client, "testuser1")
assert data == {
"name": "testuser1",
"business": {
"failure_count": 0,
"name": "NotebookRunner",
"notebook": ANY,
"success_count": 1,
"timings": ANY,
},
"state": "RUNNING",
"user": {
"scopes": ["exec:notebook"],
"token": ANY,
"username": "testuser1",
},
}
finally:
os.chdir(cwd)

# Get the log and check the cell output.
r = await client.get("/mobu/flocks/test/monkeys/testuser1/log")
assert r.status_code == 200

# Root notebook
assert "This is a test" in r.text
assert "This is another test" in r.text
assert "Final test" in r.text

# some-dir notebook
assert "Test some-dir" in r.text
assert "Another test some-dir" in r.text
assert "Final test some-dir" in r.text

# some-other-dir notebook
assert "Test some-other-dir" in r.text
assert "Another test some-other-dir" in r.text
assert "Final test some-other-dir" in r.text

# double-nested-dir notebook
assert "Test double-nested-dir" in r.text
assert "Another test double-nested-dir" in r.text
assert "Final test double-nested-dir" in r.text

# Exceptions
assert "Exception thrown" not in r.text


@pytest.mark.asyncio
async def test_exclude_dirs(
client: AsyncClient, respx_mock: respx.Router, tmp_path: Path
) -> None:
mock_gafaelfawr(respx_mock)
cwd = Path.cwd()

# Set up a notebook repository.
source_path = Path(__file__).parent.parent / "notebooks"
repo_path = tmp_path / "notebooks"

shutil.copytree(str(source_path), str(repo_path))
# Remove exception notebook
(repo_path / "exception.ipynb").unlink()

# Set up git client
git = Git(repo=repo_path)
await git.init("--initial-branch=main")
await git.config("user.email", "[email protected]")
await git.config("user.name", "Git User")
for path in repo_path.iterdir():
if not path.name.startswith("."):
await git.add(str(path))
await git.commit("-m", "Initial commit")

# Start a monkey. We have to do this in a try/finally block since the
# runner will change working directories, which because working
# directories are process-global may mess up future tests.
try:
r = await client.put(
"/mobu/flocks",
json={
"name": "test",
"count": 1,
"user_spec": {"username_prefix": "testuser"},
"scopes": ["exec:notebook"],
"business": {
"type": "NotebookRunner",
"options": {
"spawn_settle_time": 0,
"execution_idle_time": 0,
"max_executions": 10,
"repo_url": str(repo_path),
"repo_branch": "main",
"working_directory": str(repo_path),
"exclude_dirs": ["some-other-dir/nested-dir"],
},
},
},
)
assert r.status_code == 201

# Wait until we've finished one loop and check the results.
data = await wait_for_business(client, "testuser1")
assert data == {
"name": "testuser1",
"business": {
"failure_count": 0,
"name": "NotebookRunner",
"notebook": ANY,
"success_count": 1,
"timings": ANY,
},
"state": "RUNNING",
"user": {
"scopes": ["exec:notebook"],
"token": ANY,
"username": "testuser1",
},
}
finally:
os.chdir(cwd)

# Get the log and check the cell output.
r = await client.get("/mobu/flocks/test/monkeys/testuser1/log")
assert r.status_code == 200

# Root notebook
assert "This is a test" in r.text
assert "This is another test" in r.text
assert "Final test" in r.text

# some-dir notebook
assert "Test some-dir" in r.text
assert "Another test some-dir" in r.text
assert "Final test some-dir" in r.text

# some-other-dir notebook
assert "Test some-other-dir" in r.text
assert "Another test some-other-dir" in r.text
assert "Final test some-other-dir" in r.text

# nested-dir notebook
assert "Test double-nested-dir" not in r.text

# Exceptions
assert "Exception thrown" not in r.text


Expand Down Expand Up @@ -145,6 +358,7 @@ async def test_alert(
"max_executions": 1,
"repo_url": str(repo_path),
"repo_branch": "main",
"disable_run_in_subdirs": True,
},
},
},
Expand Down
Loading

0 comments on commit c870edb

Please sign in to comment.