Skip to content

Commit

Permalink
Do all notebook filtering in the business, not in the CI job
Browse files Browse the repository at this point in the history
* This is nice because it is easier to understand and less error prone
* This stinks because a job that didn't find any notebooks to run will look just like a job that successfully ran notebooks

I'd like to mark the check run as neutral in this case, but I think it
can wait until another PR.
  • Loading branch information
fajpunk committed Jul 19, 2024
1 parent 34f3d46 commit 386f218
Show file tree
Hide file tree
Showing 6 changed files with 247 additions and 161 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<!-- Delete the sections that don't apply -->

### Backwards-incompatible changes

- `exclude_dirs` from an in-repo `mobu.yaml` config file will overrided `exclude_dirs` in the Phalanx autostart config.
21 changes: 19 additions & 2 deletions src/mobu/services/business/notebookrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,13 @@
from tempfile import TemporaryDirectory
from typing import Any

import yaml
from httpx import AsyncClient
from structlog.stdlib import BoundLogger

from mobu.constants import GITHUB_REPO_CONFIG_PATH
from mobu.models.repo import RepoConfig

from ...config import config
from ...exceptions import NotebookRepositoryError
from ...models.business.notebookrunner import (
Expand Down Expand Up @@ -84,10 +88,23 @@ async def initialize(self) -> None:
self._repo_dir = Path(TemporaryDirectory(delete=False).name)
await self.clone_repo()

self._exclude_paths = {
repo_config_path = self._repo_dir / GITHUB_REPO_CONFIG_PATH
if repo_config_path.exists():
repo_config = RepoConfig.model_validate(
yaml.safe_load(repo_config_path.read_text())
)
else:
repo_config = RepoConfig()

repo_exclude_paths = {
self._repo_dir / path for path in repo_config.exclude_dirs
}

config_exclude_paths = {
(self._repo_dir / path) for path in self.options.exclude_dirs
}
self._notebook_paths = self.find_notebooks()

self._exclude_paths = repo_exclude_paths or config_exclude_paths
self.logger.info("Repository cloned and ready")

async def shutdown(self) -> None:
Expand Down
36 changes: 1 addition & 35 deletions src/mobu/services/github_ci/ci_notebook_job.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
"""GitHub CI checks for notebook repos."""

from pathlib import Path
from typing import Any

import yaml
from httpx import AsyncClient
from structlog.stdlib import BoundLogger

from mobu.constants import GITHUB_REPO_CONFIG_PATH
from mobu.exceptions import GitHubFileNotFoundError
from mobu.models.business.notebookrunner import (
NotebookRunnerConfig,
NotebookRunnerOptions,
Expand All @@ -18,7 +14,6 @@
from mobu.services.solitary import Solitary

from ...models.ci_manager import CiJobSummary
from ...models.repo import RepoConfig
from ...storage.gafaelfawr import GafaelfawrStorage
from ...storage.github import CheckRun, GitHubStorage

Expand Down Expand Up @@ -63,35 +58,10 @@ async def run(self, user: User, scopes: list[str]) -> None:
file in the repo. If there is no mobu config file, then don't exclude
any changed notebooks.
"""
# Get mobu config from repo, if it exists
exclude_dirs: set[Path] = set()
try:
raw_config = await self._github.get_file_content(
path=GITHUB_REPO_CONFIG_PATH
)
parsed_config: dict[str, Any] = yaml.safe_load(raw_config)
repo_config = RepoConfig.model_validate(parsed_config)
exclude_dirs = repo_config.exclude_dirs
except GitHubFileNotFoundError:
self._logger.debug("Mobu config file not found in repo")
except Exception as exc:
await self.check_run.fail(
error=(
"Error retreiving and parsing config file "
f"{GITHUB_REPO_CONFIG_PATH}: {exc!s}"
),
)
return

# Get changed notebook files
files = await self._github.get_changed_files()

self._notebooks = [
file
for file in files
if file.suffix == ".ipynb"
and not self._is_excluded(file, exclude_dirs)
]
self._notebooks = [file for file in files if file.suffix == ".ipynb"]

# Don't do anything if there are no notebooks to run
if not bool(self._notebooks):
Expand Down Expand Up @@ -131,10 +101,6 @@ async def run(self, user: User, scopes: list[str]) -> None:
else:
await self.check_run.fail(error=result.error or "Unknown Error")

def _is_excluded(self, notebook: Path, exclude_dirs: set[Path]) -> bool:
"""Exclude a notebook if any of its parent directories are excluded."""
return bool(set(notebook.parents) & exclude_dirs)

def summarize(self) -> CiJobSummary:
"""Information about this job."""
return CiJobSummary.model_validate(
Expand Down
223 changes: 222 additions & 1 deletion tests/business/notebookrunner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ async def test_refresh(


@pytest.mark.asyncio
async def test_exclude_dirs(
async def test_config_exclude_dirs(
client: AsyncClient, respx_mock: respx.Router, tmp_path: Path
) -> None:
mock_gafaelfawr(respx_mock)
Expand Down Expand Up @@ -466,6 +466,227 @@ async def test_exclude_dirs(
assert "Done with this cycle of notebooks" in r.text


@pytest.mark.asyncio
async def test_repo_exclude_dirs(
client: AsyncClient, respx_mock: respx.Router, tmp_path: Path
) -> None:
"""exclude_dirs from in-repo config file should override flock config."""
mock_gafaelfawr(respx_mock)
cwd = Path.cwd()

# Set up a notebook repository.
source_path = TEST_DATA_DIR / "notebooks_recursive"
repo_path = tmp_path / "notebooks"

shutil.copytree(str(source_path), str(repo_path))
# Remove exception notebook
(repo_path / "exception.ipynb").unlink()

# Add a config file
(repo_path / "mobu.yaml").write_text('exclude_dirs: ["some-other-dir"]')

# Set up git repo
await setup_git_repo(repo_path)

# Start a monkey. We have to do this in a try/finally block since the
# runner will change working directories, which because working
# directories are process-global may mess up future tests.
try:
r = await client.put(
"/mobu/flocks",
json={
"name": "test",
"count": 1,
"user_spec": {"username_prefix": "bot-mobu-testuser"},
"scopes": ["exec:notebook"],
"business": {
"type": "NotebookRunner",
"options": {
"spawn_settle_time": 0,
"execution_idle_time": 0,
"max_executions": 2,
"repo_url": str(repo_path),
"repo_ref": "main",
"working_directory": str(repo_path),
"exclude_dirs": [
"some-other-dir/nested-dir",
"some-dir",
],
},
},
},
)
assert r.status_code == 201

# Wait until we've finished one loop and check the results.
data = await wait_for_business(client, "bot-mobu-testuser1")
assert data == {
"name": "bot-mobu-testuser1",
"business": {
"failure_count": 0,
"name": "NotebookRunner",
"notebook": ANY,
"refreshing": False,
"success_count": 1,
"timings": ANY,
},
"state": "RUNNING",
"user": {
"scopes": ["exec:notebook"],
"token": ANY,
"username": "bot-mobu-testuser1",
},
}
finally:
os.chdir(cwd)

# Get the log and check the cell output.
r = await client.get("/mobu/flocks/test/monkeys/bot-mobu-testuser1/log")
assert r.status_code == 200

# Root notebook
assert "This is a test" in r.text
assert "This is another test" in r.text
assert "Final test" in r.text

# some-other-dir notebook
assert "Test some-other-dir" not in r.text
assert "Another test some-other-dir" not in r.text
assert "Final test some-other-dir" not in r.text

# some-dir notebook
assert "Test some-dir" in r.text

# nested-dir notebook
assert "Test double-nested-dir" not in r.text

# Exceptions
assert "Exception thrown" not in r.text

# Make sure mobu ran all of the notebooks it thinks it should have
assert "Done with this cycle of notebooks" in r.text


@pytest.mark.asyncio
async def test_invalid_repo_config(
client: AsyncClient,
respx_mock: respx.Router,
tmp_path: Path,
slack: MockSlackWebhook,
) -> None:
mock_gafaelfawr(respx_mock)
cwd = Path.cwd()

# Set up a notebook repository.
source_path = TEST_DATA_DIR / "notebooks_recursive"
repo_path = tmp_path / "notebooks"

shutil.copytree(str(source_path), str(repo_path))
# Remove exception notebook
(repo_path / "exception.ipynb").unlink()

# Add a bad config file
(repo_path / "mobu.yaml").write_text("nope")

# Set up git repo
await setup_git_repo(repo_path)

# Start a monkey. We have to do this in a try/finally block since the
# runner will change working directories, which because working
# directories are process-global may mess up future tests.
try:
r = await client.put(
"/mobu/flocks",
json={
"name": "test",
"count": 1,
"user_spec": {"username_prefix": "bot-mobu-testuser"},
"scopes": ["exec:notebook"],
"business": {
"type": "NotebookRunner",
"options": {
"spawn_settle_time": 0,
"execution_idle_time": 0,
"max_executions": 2,
"repo_url": str(repo_path),
"repo_ref": "main",
"working_directory": str(repo_path),
"exclude_dirs": [
"some-other-dir/nested-dir",
"some-dir",
],
},
},
},
)
assert r.status_code == 201

# Wait until we've finished one loop and check the results.
data = await wait_for_business(client, "bot-mobu-testuser1")
assert data == {
"business": {
"failure_count": 1,
"name": "NotebookRunner",
"refreshing": False,
"success_count": 0,
"timings": ANY,
},
"name": "bot-mobu-testuser1",
"state": "STOPPING",
"user": {
"scopes": ["exec:notebook"],
"token": ANY,
"username": "bot-mobu-testuser1",
},
}
finally:
os.chdir(cwd)

# Make sure we sent a validation error in a Slack notification
assert slack.messages == [
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ANY,
"verbatim": True,
},
},
{
"type": "section",
"fields": [
{
"type": "mrkdwn",
"text": "*Exception type*\nValidationError",
"verbatim": True,
},
{
"type": "mrkdwn",
"text": ANY,
"verbatim": True,
},
{
"type": "mrkdwn",
"text": "*Monkey*\ntest/bot-mobu-testuser1",
"verbatim": True,
},
{
"type": "mrkdwn",
"text": "*User*\nbot-mobu-testuser1",
"verbatim": True,
},
],
},
{"type": "divider"},
]
}
]
error = slack.messages[0]["blocks"][0]["text"]["text"]
assert "1 validation error" in error


@pytest.mark.asyncio
async def test_alert(
client: AsyncClient,
Expand Down
Loading

0 comments on commit 386f218

Please sign in to comment.