NotebookRunner: Run all notebooks in repo directory

* NotebookRunner business now runs all notebooks in a repo by default * Add `disable_run_in_subdirs` option to NotebookRunner business config. This is `False` by default, so the default going forward is that all notebooks in a repo are run by this business. [This phalanx PR] sets this option to `true` in all existing configurations so as not to break any existing workflows. * Add `exclude_dirs` option to NotebookRunner business to list directories in which notebooks will NOT be run
lsst-sqre · May 19, 2024 · c870edb · c870edb
1 parent 6b8bf9c
commit c870edb
Show file tree

Hide file tree

Showing 8 changed files with 1,430 additions and 8 deletions.
diff --git a/requirements/dev.in b/requirements/dev.in
@@ -10,6 +10,7 @@
 # Testing
 asgi-lifespan
 coverage[toml]
+documenteer[guide]
 mypy
 pre-commit
 pytest

diff --git a/requirements/dev.txt b/requirements/dev.txt
diff --git a/src/mobu/models/business/notebookrunner.py b/src/mobu/models/business/notebookrunner.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+from pathlib import Path
 from typing import Literal
 
 from pydantic import Field
@@ -48,6 +49,23 @@ class NotebookRunnerOptions(NubladoBusinessOptions):
         description="Only used by the NotebookRunner",
     )
 
+    disable_run_in_subdirs: bool = Field(
+        False,
+        title="Don't run notebooks in subdirectories",
+        description="Only used by the NotebookRunner",
+    )
+
+    exclude_dirs: set[Path] = Field(
+        set(),
+        title="Any notebooks in these directories will not be run",
+        description=(
+            " These directories are relative to the repo root. Any notebooks"
+            " in child directories of these directories will also be excluded."
+            " Only used by the NotebookRunner.",
+        ),
+        examples=["some-dir", "some-dir/some-other-dir"],
+    )
+
 
 class NotebookRunnerConfig(BusinessConfig):
     """Configuration specialization for NotebookRunner."""

diff --git a/src/mobu/services/business/notebookrunner.py b/src/mobu/services/business/notebookrunner.py
@@ -57,6 +57,7 @@ def __init__(
         self._notebook: Path | None = None
         self._notebook_paths: list[Path] | None = None
         self._repo_dir: Path | None = None
+        self._exclude_paths: set[Path] | None = None
         self._running_code: str | None = None
         self._git = Git(logger=logger)
 
@@ -72,6 +73,9 @@ async def startup(self) -> None:
         if self._repo_dir is None:
             self._repo_dir = Path(TemporaryDirectory().name)
             await self.clone_repo()
+        self._exclude_paths = {
+            (self._repo_dir / path) for path in self.options.exclude_dirs
+        }
         self._notebook_paths = self.find_notebooks()
         self.logger.info("Repository cloned and ready")
         await super().startup()
@@ -87,15 +91,24 @@ async def clone_repo(self) -> None:
         with self.timings.start("clone_repo"):
             await self._git.clone("-b", branch, url, str(self._repo_dir))
 
+    def is_excluded(self, notebook: Path) -> bool:
+        # A notebook is excluded if any of its parent directories are excluded
+        return set(notebook.parents) & self._exclude_paths
+
     def find_notebooks(self) -> list[Path]:
         with self.timings.start("find_notebooks"):
             if self._repo_dir is None:
                 raise NotebookRepositoryError(
                     "Repository directory must be set", self.user.username
                 )
-            notebooks = [
-                p for p in self._repo_dir.iterdir() if p.suffix == ".ipynb"
-            ]
+            if self.options.disable_run_in_subdirs:
+                notebooks = list(self._repo_dir.glob("*.ipynb"))
+            else:
+                notebooks = [
+                    n
+                    for n in self._repo_dir.glob("**/*.ipynb")
+                    if not self.is_excluded(n)
+                ]
             if not notebooks:
                 msg = "No notebooks found in {self._repo_dir}"
                 raise NotebookRepositoryError(msg, self.user.username)

diff --git a/tests/business/notebookrunner_test.py b/tests/business/notebookrunner_test.py
@@ -59,9 +59,10 @@ async def test_run(
                     "options": {
                         "spawn_settle_time": 0,
                         "execution_idle_time": 0,
-                        "max_executions": 1,
+                        "max_executions": 10,
                         "repo_url": str(repo_path),
                         "repo_branch": "main",
+                        "disable_run_in_subdirs": True,
                         "working_directory": str(repo_path),
                     },
                 },
@@ -76,7 +77,7 @@ async def test_run(
             "business": {
                 "failure_count": 0,
                 "name": "NotebookRunner",
-                "notebook": "test-notebook.ipynb",
+                "notebook": ANY,
                 "success_count": 1,
                 "timings": ANY,
             },
@@ -93,9 +94,221 @@ async def test_run(
     # Get the log and check the cell output.
     r = await client.get("/mobu/flocks/test/monkeys/testuser1/log")
     assert r.status_code == 200
+
+    # Root notebook
+    assert "This is a test" in r.text
+    assert "This is another test" in r.text
+    assert "Final test" in r.text
+
+    # some-dir notebook
+    assert "Test some-dir" not in r.text
+
+    # some-other-dir notebook
+    assert "Test some-other-dir" not in r.text
+
+    # double-nested-dir notebook
+    assert "Test double-nested-dir" not in r.text
+
+    # Exceptions
+    assert "Exception thrown" not in r.text
+
+
+@pytest.mark.asyncio
+async def test_run_subdirs(
+    client: AsyncClient, respx_mock: respx.Router, tmp_path: Path
+) -> None:
+    mock_gafaelfawr(respx_mock)
+    cwd = Path.cwd()
+
+    # Set up a notebook repository.
+    source_path = Path(__file__).parent.parent / "notebooks"
+    repo_path = tmp_path / "notebooks"
+
+    shutil.copytree(str(source_path), str(repo_path))
+    # Remove exception notebook
+    (repo_path / "exception.ipynb").unlink()
+
+    # Set up git client
+    git = Git(repo=repo_path)
+    await git.init("--initial-branch=main")
+    await git.config("user.email", "[email protected]")
+    await git.config("user.name", "Git User")
+    for path in repo_path.iterdir():
+        if not path.name.startswith("."):
+            await git.add(str(path))
+    await git.commit("-m", "Initial commit")
+
+    # Start a monkey. We have to do this in a try/finally block since the
+    # runner will change working directories, which because working
+    # directories are process-global may mess up future tests.
+    try:
+        r = await client.put(
+            "/mobu/flocks",
+            json={
+                "name": "test",
+                "count": 1,
+                "user_spec": {"username_prefix": "testuser"},
+                "scopes": ["exec:notebook"],
+                "business": {
+                    "type": "NotebookRunner",
+                    "options": {
+                        "spawn_settle_time": 0,
+                        "execution_idle_time": 0,
+                        "max_executions": 10,
+                        "repo_url": str(repo_path),
+                        "repo_branch": "main",
+                        "working_directory": str(repo_path),
+                    },
+                },
+            },
+        )
+        assert r.status_code == 201
+
+        # Wait until we've finished one loop and check the results.
+        data = await wait_for_business(client, "testuser1")
+        assert data == {
+            "name": "testuser1",
+            "business": {
+                "failure_count": 0,
+                "name": "NotebookRunner",
+                "notebook": ANY,
+                "success_count": 1,
+                "timings": ANY,
+            },
+            "state": "RUNNING",
+            "user": {
+                "scopes": ["exec:notebook"],
+                "token": ANY,
+                "username": "testuser1",
+            },
+        }
+    finally:
+        os.chdir(cwd)
+
+    # Get the log and check the cell output.
+    r = await client.get("/mobu/flocks/test/monkeys/testuser1/log")
+    assert r.status_code == 200
+
+    # Root notebook
     assert "This is a test" in r.text
     assert "This is another test" in r.text
     assert "Final test" in r.text
+
+    # some-dir notebook
+    assert "Test some-dir" in r.text
+    assert "Another test some-dir" in r.text
+    assert "Final test some-dir" in r.text
+
+    # some-other-dir notebook
+    assert "Test some-other-dir" in r.text
+    assert "Another test some-other-dir" in r.text
+    assert "Final test some-other-dir" in r.text
+
+    # double-nested-dir notebook
+    assert "Test double-nested-dir" in r.text
+    assert "Another test double-nested-dir" in r.text
+    assert "Final test double-nested-dir" in r.text
+
+    # Exceptions
+    assert "Exception thrown" not in r.text
+
+
+@pytest.mark.asyncio
+async def test_exclude_dirs(
+    client: AsyncClient, respx_mock: respx.Router, tmp_path: Path
+) -> None:
+    mock_gafaelfawr(respx_mock)
+    cwd = Path.cwd()
+
+    # Set up a notebook repository.
+    source_path = Path(__file__).parent.parent / "notebooks"
+    repo_path = tmp_path / "notebooks"
+
+    shutil.copytree(str(source_path), str(repo_path))
+    # Remove exception notebook
+    (repo_path / "exception.ipynb").unlink()
+
+    # Set up git client
+    git = Git(repo=repo_path)
+    await git.init("--initial-branch=main")
+    await git.config("user.email", "[email protected]")
+    await git.config("user.name", "Git User")
+    for path in repo_path.iterdir():
+        if not path.name.startswith("."):
+            await git.add(str(path))
+    await git.commit("-m", "Initial commit")
+
+    # Start a monkey. We have to do this in a try/finally block since the
+    # runner will change working directories, which because working
+    # directories are process-global may mess up future tests.
+    try:
+        r = await client.put(
+            "/mobu/flocks",
+            json={
+                "name": "test",
+                "count": 1,
+                "user_spec": {"username_prefix": "testuser"},
+                "scopes": ["exec:notebook"],
+                "business": {
+                    "type": "NotebookRunner",
+                    "options": {
+                        "spawn_settle_time": 0,
+                        "execution_idle_time": 0,
+                        "max_executions": 10,
+                        "repo_url": str(repo_path),
+                        "repo_branch": "main",
+                        "working_directory": str(repo_path),
+                        "exclude_dirs": ["some-other-dir/nested-dir"],
+                    },
+                },
+            },
+        )
+        assert r.status_code == 201
+
+        # Wait until we've finished one loop and check the results.
+        data = await wait_for_business(client, "testuser1")
+        assert data == {
+            "name": "testuser1",
+            "business": {
+                "failure_count": 0,
+                "name": "NotebookRunner",
+                "notebook": ANY,
+                "success_count": 1,
+                "timings": ANY,
+            },
+            "state": "RUNNING",
+            "user": {
+                "scopes": ["exec:notebook"],
+                "token": ANY,
+                "username": "testuser1",
+            },
+        }
+    finally:
+        os.chdir(cwd)
+
+    # Get the log and check the cell output.
+    r = await client.get("/mobu/flocks/test/monkeys/testuser1/log")
+    assert r.status_code == 200
+
+    # Root notebook
+    assert "This is a test" in r.text
+    assert "This is another test" in r.text
+    assert "Final test" in r.text
+
+    # some-dir notebook
+    assert "Test some-dir" in r.text
+    assert "Another test some-dir" in r.text
+    assert "Final test some-dir" in r.text
+
+    # some-other-dir notebook
+    assert "Test some-other-dir" in r.text
+    assert "Another test some-other-dir" in r.text
+    assert "Final test some-other-dir" in r.text
+
+    # nested-dir notebook
+    assert "Test double-nested-dir" not in r.text
+
+    # Exceptions
     assert "Exception thrown" not in r.text
 
 
@@ -145,6 +358,7 @@ async def test_alert(
                     "max_executions": 1,
                     "repo_url": str(repo_path),
                     "repo_branch": "main",
+                    "disable_run_in_subdirs": True,
                 },
             },
         },