Skip to content

Commit

Permalink
Add retries field to BeakerLaunchConfig (#79)
Browse files: browse the repository at this point in the history
  • Loading branch information
epwalsh authored Nov 4, 2024
1 parent 29e1276 commit b9351e2
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 2 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Added

- Added `retries` field to `BeakerLaunchConfig`.

## [v1.6.0](https://github.com/allenai/OLMo-core/releases/tag/v1.6.0) - 2024-11-01

### Added
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ dev = [
"sphinx-autodoc-typehints==1.23.3",
]
beaker = [
"beaker-py",
"beaker-py>=1.32.0",
"GitPython>=3.0,<4.0",
]
wandb = [
Expand Down
13 changes: 12 additions & 1 deletion src/olmo_core/launch/beaker.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
ExperimentSpec,
Job,
Priority,
RetrySpec,
TaskResources,
TaskSpec,
)
Expand Down Expand Up @@ -175,6 +176,11 @@ class BeakerLaunchConfig(Config):
If the job should be preemptible.
"""

retries: Optional[int] = None
"""
The number of times to retry the experiment if it fails.
"""

env_vars: List[BeakerEnvVar] = field(default_factory=list)
"""
Additional env vars to include.
Expand Down Expand Up @@ -360,7 +366,12 @@ def build_experiment_spec(self, torchrun: bool = True) -> ExperimentSpec:
for bucket in self.weka_buckets:
task_spec = task_spec.with_dataset(bucket.mount, weka=bucket.bucket)

return ExperimentSpec(description=self.description, budget=self.budget, tasks=[task_spec])
return ExperimentSpec(
description=self.description,
budget=self.budget,
tasks=[task_spec],
retry=None if not self.retries else RetrySpec(allowed_task_retries=self.retries),
)

def _follow_experiment(self, experiment: Experiment):
# Wait for job to start...
Expand Down

0 comments on commit b9351e2

Please sign in to comment.