diff --git a/content/docs/ref/runner.md b/content/docs/ref/runner.md
index 7cbe17e2..6f099536 100644
--- a/content/docs/ref/runner.md
+++ b/content/docs/ref/runner.md
@@ -78,6 +78,15 @@ Any [generic option](/doc/ref) in addition to:
   need to write your code to save intermediate results to take advantage of
   this).
 
+### Bitbucket
+
+- **GPU support**.
+
+  See
+  [the guide on self-hosted Bitbucket runners](/doc/self-hosted-runners?tab=Bitbucket)
+  to work around
+  [Bitbucket's lack of native GPU support](https://jira.atlassian.com/browse/BCLOUD-21459).
+
 ## Examples
 
 ### Using `--cloud-permission-set`
diff --git a/content/docs/self-hosted-runners.md b/content/docs/self-hosted-runners.md
index 3a6b5cfb..9fa5b876 100644
--- a/content/docs/self-hosted-runners.md
+++ b/content/docs/self-hosted-runners.md
@@ -116,7 +116,7 @@ train-and-report:
 ```
 
-
+
 
 ```yaml
 pipelines:
@@ -134,7 +134,6 @@ pipelines:
     - step:
         runs-on: [self.hosted, cml.runner]
         image: iterativeai/cml:0-dvc2-base1
-        # GPU not yet supported, see https://github.com/iterative/cml/issues/1015
         script:
           - pip install -r requirements.txt
          - python train.py # generate plot.png
@@ -144,6 +143,56 @@ pipelines:
           - cml comment create report.md
 ```
 
+
+
+
+Bitbucket does not support GPUs natively
+([cml#1015](https://github.com/iterative/cml/issues/1015),
+[BCLOUD-21459](https://jira.atlassian.com/browse/BCLOUD-21459)). A work-around
+is to directly use
+[TPI](https://github.com/iterative/terraform-provider-iterative) (the library
+which CML `runner` uses internally). TPI includes a CLI-friendly helper called
+LEO (launch, execute, orchestrate), used below:
+
+```yaml
+image: iterativeai/cml:0-dvc2-base1
+pipelines:
+  default:
+    - step:
+        name: Launch Runner and Train
+        script:
+          # Create training script
+          - |
+            cat <<EOF > leo-script.sh
+            #!/bin/bash
+            apt-get update -q && apt-get install -yq python3.9
+            pip3 install -r requirements.txt
+            python train.py # generate plot.png
+            EOF
+          # Launch runner
+          - |
+            LEO_OPTIONS="--cloud=aws --region=us-west"
+            leo_id=$(leo create $LEO_OPTIONS \
+              --image=nvidia \
+              --machine=p2.xlarge \
+              --disk-size=64 \
+              --workdir=. \
+              --output=. \
+              --environment AWS_ACCESS_KEY_ID="$AWS_ACCESS_KEY_ID" \
+              --environment AWS_SECRET_ACCESS_KEY="$AWS_SECRET_ACCESS_KEY" \
+              --script="$(cat ./leo-script.sh)"
+            )
+            # Wait for cloud training to finish
+            leo read $LEO_OPTIONS --follow "$leo_id"
+            sleep 45 # TODO: explain why this delay is needed
+            # Download cloud training results & clean up cloud resources
+            leo delete $LEO_OPTIONS --workdir=. --output=. "$leo_id"
+          # Create CML report
+          - cat metrics.txt >> report.md
+          - echo '![](./plot.png "Confusion Matrix")' >> report.md
+          - cml comment create report.md
+```
+
diff --git a/src/components/pages/Home/UseCasesSection/index.tsx b/src/components/pages/Home/UseCasesSection/index.tsx
index 9f96448f..999a900b 100644
--- a/src/components/pages/Home/UseCasesSection/index.tsx
+++ b/src/components/pages/Home/UseCasesSection/index.tsx
@@ -673,6 +673,8 @@ const UseCasesSection: React.ForwardRefRenderFunction = () => (
 bitbucket={(
+# Use LEO instead of CML to force GPU support on Bitbucket
+# (https://cml.dev/doc/ref/runner#bitbucket)
 image: iterativeai/cml:0-dvc2-base1
 pipelines:
   default:
@@ -705,7 +707,7 @@ const UseCasesSection: React.ForwardRefRenderFunction = () => (
               --script="$(cat ./leo-script.sh)"
             )
             leo read $LEO_OPTIONS --follow "$leo_id"
-            sleep 45 # TODO: replace this hack with a proper wait loop
+            sleep 45 # TODO: explain why this delay is needed
             leo delete $LEO_OPTIONS --workdir="." --output="." \
               "$leo_id"