Skip to content

Commit

Permalink
Run all notebooks in repo directory with exclude option
Browse files Browse the repository at this point in the history
* NotebookRunner business now runs all notebooks in a repo by default
* Add `disable_run_in_subdirs` option to NotebookRunner business config.
  This is `False` by default, so the default going forward is that all
  notebooks in a repo are run by this business.
  [This phalanx PR] sets this option to `true` in all existing
  configurations so as not to break any existing workflows.
* Add `exclude_dirs` option to NotebookRunner business to list
  directories in which notebooks will NOT be run
  • Loading branch information
fajpunk committed May 20, 2024
1 parent 498015e commit 047d465
Show file tree
Hide file tree
Showing 8 changed files with 618 additions and 4 deletions.
4 changes: 4 additions & 0 deletions src/mobu/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@
from datetime import timedelta

__all__ = [
"NOTEBOOK_ALWAYS_EXCLUDE_DIRS",
"NOTEBOOK_REPO_URL",
"NOTEBOOK_REPO_BRANCH",
"TOKEN_LIFETIME",
"USERNAME_REGEX",
"WEBSOCKET_OPEN_TIMEOUT",
]

NOTEBOOK_ALWAYS_EXCLUDE_DIRS = {"mobu_exclude"}
"""NotebookRunner always excludes notebooks in these dirs."""

NOTEBOOK_REPO_URL = "https://github.com/lsst-sqre/notebook-demo.git"
"""Default notebook repository for NotebookRunner."""

Expand Down
20 changes: 19 additions & 1 deletion src/mobu/models/business/notebookrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@

from __future__ import annotations

from pathlib import Path
from typing import Literal

from pydantic import Field

from ...constants import NOTEBOOK_REPO_BRANCH, NOTEBOOK_REPO_URL
from ...constants import (
NOTEBOOK_ALWAYS_EXCLUDE_DIRS,
NOTEBOOK_REPO_BRANCH,
NOTEBOOK_REPO_URL,
)
from .base import BusinessConfig
from .nublado import NubladoBusinessData, NubladoBusinessOptions

Expand Down Expand Up @@ -48,6 +53,19 @@ class NotebookRunnerOptions(NubladoBusinessOptions):
description="Only used by the NotebookRunner",
)

exclude_dirs: set[Path] = Field(
set(),
title="Any notebooks in these directories will not be run",
description=(
" These directories are relative to the repo root. Any notebooks"
" in child directories of these directories will also be excluded."
" Regardless of this value, notebooks in these dirs will always"
f" be excluded: {NOTEBOOK_ALWAYS_EXCLUDE_DIRS}"
" Only used by the NotebookRunner."
),
examples=["some-dir", "some-dir/some-other-dir"],
)


class NotebookRunnerConfig(BusinessConfig):
"""Configuration specialization for NotebookRunner."""
Expand Down
16 changes: 15 additions & 1 deletion src/mobu/services/business/notebookrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from httpx import AsyncClient
from structlog.stdlib import BoundLogger

from ...constants import NOTEBOOK_ALWAYS_EXCLUDE_DIRS
from ...exceptions import NotebookRepositoryError
from ...models.business.notebookrunner import (
NotebookRunnerData,
Expand Down Expand Up @@ -57,6 +58,7 @@ def __init__(
self._notebook: Path | None = None
self._notebook_paths: list[Path] | None = None
self._repo_dir: Path | None = None
self._exclude_paths: set[Path] = set()
self._running_code: str | None = None
self._git = Git(logger=logger)

Expand All @@ -72,6 +74,10 @@ async def startup(self) -> None:
if self._repo_dir is None:
self._repo_dir = Path(TemporaryDirectory().name)
await self.clone_repo()
exclude_dirs = self.options.exclude_dirs | NOTEBOOK_ALWAYS_EXCLUDE_DIRS
self._exclude_paths = {
(self._repo_dir / path) for path in exclude_dirs
}
self._notebook_paths = self.find_notebooks()
self.logger.info("Repository cloned and ready")
await super().startup()
Expand All @@ -87,14 +93,22 @@ async def clone_repo(self) -> None:
with self.timings.start("clone_repo"):
await self._git.clone("-b", branch, url, str(self._repo_dir))

def is_excluded(self, notebook: Path) -> bool:
# A notebook is excluded if any of its parent directories are excluded
parents = set(notebook.parents)
excluded_parents = parents & self._exclude_paths
return len(excluded_parents) != 0

def find_notebooks(self) -> list[Path]:
with self.timings.start("find_notebooks"):
if self._repo_dir is None:
raise NotebookRepositoryError(
"Repository directory must be set", self.user.username
)
notebooks = [
p for p in self._repo_dir.iterdir() if p.suffix == ".ipynb"
n
for n in self._repo_dir.glob("**/*.ipynb")
if not self.is_excluded(n)
]
if not notebooks:
msg = "No notebooks found in {self._repo_dir}"
Expand Down
129 changes: 127 additions & 2 deletions tests/business/notebookrunner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ async def test_run(
"options": {
"spawn_settle_time": 0,
"execution_idle_time": 0,
"max_executions": 1,
"max_executions": 10,
"repo_url": str(repo_path),
"repo_branch": "main",
"working_directory": str(repo_path),
Expand All @@ -76,7 +76,7 @@ async def test_run(
"business": {
"failure_count": 0,
"name": "NotebookRunner",
"notebook": "test-notebook.ipynb",
"notebook": ANY,
"success_count": 1,
"timings": ANY,
},
Expand All @@ -93,9 +93,134 @@ async def test_run(
# Get the log and check the cell output.
r = await client.get("/mobu/flocks/test/monkeys/testuser1/log")
assert r.status_code == 200

# Root notebook
assert "This is a test" in r.text
assert "This is another test" in r.text
assert "Final test" in r.text

# some-dir notebook
assert "Test some-dir" in r.text
assert "Another test some-dir" in r.text
assert "Final test some-dir" in r.text

# some-other-dir notebook
assert "Test some-other-dir" in r.text
assert "Another test some-other-dir" in r.text
assert "Final test some-other-dir" in r.text

# double-nested-dir notebook
assert "Test double-nested-dir" in r.text
assert "Another test double-nested-dir" in r.text
assert "Final test double-nested-dir" in r.text

# always excluded notebook
assert "This should never run." not in r.text

# Exceptions
assert "Exception thrown" not in r.text


@pytest.mark.asyncio
async def test_exclude_dirs(
client: AsyncClient, respx_mock: respx.Router, tmp_path: Path
) -> None:
mock_gafaelfawr(respx_mock)
cwd = Path.cwd()

# Set up a notebook repository.
source_path = Path(__file__).parent.parent / "notebooks"
repo_path = tmp_path / "notebooks"

shutil.copytree(str(source_path), str(repo_path))
# Remove exception notebook
(repo_path / "exception.ipynb").unlink()

# Set up git client
git = Git(repo=repo_path)
await git.init("--initial-branch=main")
await git.config("user.email", "[email protected]")
await git.config("user.name", "Git User")
for path in repo_path.iterdir():
if not path.name.startswith("."):
await git.add(str(path))
await git.commit("-m", "Initial commit")

# Start a monkey. We have to do this in a try/finally block since the
# runner will change working directories, which because working
# directories are process-global may mess up future tests.
try:
r = await client.put(
"/mobu/flocks",
json={
"name": "test",
"count": 1,
"user_spec": {"username_prefix": "testuser"},
"scopes": ["exec:notebook"],
"business": {
"type": "NotebookRunner",
"options": {
"spawn_settle_time": 0,
"execution_idle_time": 0,
"max_executions": 10,
"repo_url": str(repo_path),
"repo_branch": "main",
"working_directory": str(repo_path),
"exclude_dirs": [
"some-other-dir/nested-dir",
"some-dir",
],
},
},
},
)
assert r.status_code == 201

# Wait until we've finished one loop and check the results.
data = await wait_for_business(client, "testuser1")
assert data == {
"name": "testuser1",
"business": {
"failure_count": 0,
"name": "NotebookRunner",
"notebook": ANY,
"success_count": 1,
"timings": ANY,
},
"state": "RUNNING",
"user": {
"scopes": ["exec:notebook"],
"token": ANY,
"username": "testuser1",
},
}
finally:
os.chdir(cwd)

# Get the log and check the cell output.
r = await client.get("/mobu/flocks/test/monkeys/testuser1/log")
assert r.status_code == 200

# Root notebook
assert "This is a test" in r.text
assert "This is another test" in r.text
assert "Final test" in r.text

# some-other-dir notebook
assert "Test some-other-dir" in r.text
assert "Another test some-other-dir" in r.text
assert "Final test some-other-dir" in r.text

# some-dir notebook
assert "Test some-dir" not in r.text

# nested-dir notebook
assert "Test double-nested-dir" not in r.text

# always excluded notebook
assert "This should never run." not in r.text

# Exceptions
assert "Exception thrown" not in r.text


Expand Down
129 changes: 129 additions & 0 deletions tests/notebooks/mobu_exclude/test-always-excluded-notebook.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "5cb3e0b7",
"metadata": {},
"source": "This notebook should never get run because it is in an always-excluded directory."
},
{
"cell_type": "code",
"id": "f84f0959",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-20T14:45:05.038058Z",
"start_time": "2024-05-20T14:45:05.035661Z"
}
},
"source": "print(\"This should never run.\")",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"This should never run.\n"
]
}
],
"execution_count": 2
},
{
"cell_type": "code",
"id": "44ada997",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-20T14:45:05.073817Z",
"start_time": "2024-05-20T14:45:05.041797Z"
}
},
"source": [
"import os\n",
"print(\"This should never run either.\")"
],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"This should never run either.\n"
]
}
],
"execution_count": 3
},
{
"cell_type": "markdown",
"id": "d5cea3a7",
"metadata": {},
"source": [
"Another Markdown cell"
]
},
{
"cell_type": "markdown",
"id": "541caea0",
"metadata": {},
"source": [
"Yet another Markdown cell. This one has a list.\n",
"- one\n",
"- two\n",
"- three"
]
},
{
"cell_type": "code",
"id": "53a941a4",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-20T14:45:05.100321Z",
"start_time": "2024-05-20T14:45:05.074766Z"
}
},
"source": "print(\"Finally, this should never run\")",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finally, this should never run\n"
]
}
],
"execution_count": 4
},
{
"cell_type": "code",
"id": "823560c6",
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-20T14:45:05.102973Z",
"start_time": "2024-05-20T14:45:05.101128Z"
}
},
"source": [],
"outputs": [],
"execution_count": 4
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit 047d465

Please sign in to comment.