-
Notifications
You must be signed in to change notification settings - Fork 30
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'justin-the-law' into chore-add-nvidia-runtimeClass
- Loading branch information
Showing
9 changed files
with
327 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# Builds the k3d-gpu image and publishes it to the GitHub Container Registry.
name: Build Images

# Runs on manual dispatch and on the listed pull-request activity types.
on:
  workflow_dispatch:
  pull_request:
    types:
      - ready_for_review
      - review_requested
      - synchronize

# packages: write is needed for the docker push to ghcr.io below.
permissions:
  contents: read
  packages: write

# Placeholders only; the "Read tags from config" step overwrites both values
# for later steps by appending to GITHUB_ENV.
env:
  K3S_TAG: ""
  CUDA_TAG: ""

jobs:
  build-and-publish-images:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Repo
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11  # v4.1.1

      - name: Read tags from config
        id: read_tags
        run: |
          config=packages/k3d-gpu/version_config.json
          # Assign first so a jq failure aborts the step; `jq -e` exits
          # non-zero when the key is missing or null instead of printing "null".
          k3s_tag="$(jq -er .k3s_tag "$config")"
          cuda_tag="$(jq -er .cuda_tag "$config")"
          {
            echo "K3S_TAG=${k3s_tag}"
            echo "CUDA_TAG=${cuda_tag}"
          } >> "$GITHUB_ENV"

      - name: Login to GitHub Container Registry
        uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d  # v3.0.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # NOTE(review): pull-request runs publish the :latest tag as well;
      # confirm that is intended.
      - name: Build and Publish k3d-gpu
        run: |
          # The tags are read as shell variables (exported through GITHUB_ENV)
          # rather than spliced into the script text, and are quoted.
          # The build context is the package directory because the Dockerfile
          # copies ./plugin/device-plugin-daemonset.yaml relative to it.
          docker build \
            --platform linux/amd64 \
            --build-arg K3S_TAG="$K3S_TAG" \
            --build-arg CUDA_TAG="$CUDA_TAG" \
            -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \
            -f packages/k3d-gpu/Dockerfile packages/k3d-gpu
          docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Tags of the two base images; both are supplied with --build-arg.
ARG K3S_TAG
ARG CUDA_TAG

# The "k3s" stage is only used as a source for the COPY --from instructions
# below; the final image is based on nvidia/cuda.
FROM rancher/k3s:$K3S_TAG AS k3s
FROM nvidia/cuda:$CUDA_TAG

# Install the NVIDIA container toolkit:
#   1. install curl,
#   2. store NVIDIA's repository key as a dearmored keyring,
#   3. register the apt source, rewritten to reference that keyring,
#   4. install the toolkit packages and util-linux,
#   5. run `nvidia-ctk runtime configure` for containerd.
RUN apt-get update && apt-get install -y curl \
    && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
    && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
        sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
        tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
    && apt-get update && apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux \
    && nvidia-ctk runtime configure --runtime=containerd

# Bring in the k3s stage's files, with /bin copied by a separate instruction.
# NOTE(review): Dockerfile flags are normally recognised only before the source
# paths. Confirm the trailing --exclude below is honoured as a flag and is not
# being taken as the destination path.
COPY --from=k3s / / --exclude=/bin/
COPY --from=k3s /bin /bin

# Deploy the nvidia driver plugin on startup
COPY ./plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml

# Declared as volumes: kubelet, k3s, CNI and log directories.
VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log

# DIFF: resolve fsnotify issues
# NOTE(review): these sysctl writes execute while the image is being built;
# confirm they have the intended effect on containers started from the image.
RUN sysctl -w fs.inotify.max_user_watches=100000
RUN sysctl -w fs.inotify.max_user_instances=100000

# Append /bin/aux to the search path.
ENV PATH="$PATH:/bin/aux"

# Start k3s; the default argument is "agent".
ENTRYPOINT ["/bin/k3s"]
CMD ["agent"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Directory that contains this Makefile, so the targets work from any cwd.
MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

.PHONY: k3d-gpu-package create-uds-gpu-cluster

# Run `uds zarf package create` from this Makefile's directory.
k3d-gpu-package:
	@cd $(MAKEFILE_DIR) && uds zarf package create

# Deploy the k3d-gpu package archive found in this Makefile's directory.
create-uds-gpu-cluster:
	@cd $(MAKEFILE_DIR) && uds zarf package deploy zarf-package-k3d-gpu-amd64-*.tar.zst --confirm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# K3D GPU | ||
|
||
Prepares a `k3s` + `nvidia/cuda` base image that gives a K3D cluster access to your host machine's NVIDIA, CUDA-capable GPU(s). | ||
|
||
## Pre-Requisites | ||
|
||
* Docker: https://www.docker.com/ | ||
* K3D: https://k3d.io/ | ||
* UDS-CLI: https://github.com/defenseunicorns/uds-cli | ||
* Modern NVIDIA GPU with CUDA cores and drivers must be present. Additionally, the CUDA toolkit and NVIDIA container toolkit must be installed. | ||
|
||
## Usage | ||
|
||
Check out the Make targets for the various options. | ||
|
||
### Local | ||
|
||
```shell | ||
make push-k3d-gpu # build and push image to a local registry | ||
|
||
make uds-gpu-cluster # create a uds cluster equipped with the k3d-gpu image | ||
|
||
make test-k3d-gpu # deploy a test gpu pod to see if everything is working | ||
``` | ||
|
||
## References | ||
|
||
* https://k3d.io/v5.7.2/usage/advanced/cuda/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
# RuntimeClass "nvidia", backed by the handler of the same name. The DaemonSet
# below selects it through runtimeClassName.
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
# NVIDIA device plugin, deployed as a DaemonSet in kube-system.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      name: nvidia-device-plugin-daemonset
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-daemonset
    spec:
      # Explicitly request the nvidia runtime.
      runtimeClassName: nvidia
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: system-node-critical
      # Tolerate the NoSchedule taint keyed nvidia.com/gpu.
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      containers:
        - name: nvidia-device-plugin-ctr
          image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
          env:
            - name: PASS_DEVICE_SPECS
              value: "true"
            - name: FAIL_ON_INIT_ERROR
              value: "true"
            - name: DEVICE_LIST_STRATEGY
              value: envvar
            - name: DEVICE_ID_STRATEGY
              value: uuid
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: MPS_ROOT
              value: /run/nvidia/mps
          # No privilege escalation and every capability dropped.
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      # Host directory mounted into the container at the same path.
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# One-shot test pod: runs the CUDA vector-add sample image with one GPU on the
# nvidia runtime class and is never restarted.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
  labels:
    app: gpu-pod
spec:
  restartPolicy: Never
  runtimeClassName: nvidia
  # Tolerate the NoSchedule taint keyed nvidia.com/gpu.
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
  containers:
    - name: cuda-container
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
      resources:
        limits:
          # requesting 1 GPU
          nvidia.com/gpu: "1"
          cpu: "1"
          memory: 0.5Gi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Values set at package-create time: `version` and `reg_name`.
package:
  create:
    set:
      # The two x-release-please markers must stay around the version line.
      # x-release-please-start-version
      version: 0.9.1
      # x-release-please-end
      reg_name: registry
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/zarf/main/zarf.schema.json

# Zarf package that starts a local registry, builds and pushes the k3d-gpu
# image, creates a cluster that uses it, and runs a GPU test pod.
kind: ZarfPackageConfig
metadata:
  name: "k3d-gpu"
  version: '###ZARF_PKG_TMPL_VERSION###'
  description: >
    k3d base image with GPU support

# Deploy-time variables, referenced in the actions below as ${ZARF_VAR_<NAME>}.
variables:
  - name: REG_PORT
    description: "Local registry port"
    default: "5000"
  - name: UDS_CORE
    description: "UDS Core version to use"
    default: "k3d-core-slim-dev:0.24.0"
  - name: K3S_TAG
    description: "K3s version to use"
    default: "v1.28.8-k3s1"
  - name: CUDA_TAG
    description: "CUDA version to use"
    default: "12.4.1-base-ubuntu22.04"

components:
  # Local registry container, published on REG_PORT.
  # `set +e` makes the docker commands best-effort: a failure does not abort
  # the action.
  - name: create-local-registry
    required: true
    actions:
      onDeploy:
        before:
          - description: "Start the local registry"
            cmd: |
              set +e
              docker run -d --name ###ZARF_PKG_TMPL_REG_NAME### -p ${ZARF_VAR_REG_PORT}:5000 registry:2
              set -e
      onRemove:
        before:
          - description: "Stop and remove the local registry"
            cmd: |
              set +e
              docker stop ###ZARF_PKG_TMPL_REG_NAME###
              docker rm ###ZARF_PKG_TMPL_REG_NAME###
              set -e

  # Build the image from ./Dockerfile, tag it for the local registry, then
  # push it there.
  - name: build-image
    required: true
    files:
      - source: Dockerfile
        target: Dockerfile
      - source: plugin/device-plugin-daemonset.yaml
        target: plugin/device-plugin-daemonset.yaml
    actions:
      onDeploy:
        before:
          - description: "Build the k3d-gpu image"
            cmd: |
              docker build \
                --platform linux/amd64 \
                --build-arg K3S_TAG=${ZARF_VAR_K3S_TAG} \
                --build-arg CUDA_TAG=${ZARF_VAR_CUDA_TAG} \
                -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \
                -f ./Dockerfile .
              docker tag \
                ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \
                localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###
        after:
          - description: "Push the image to the local registry"
            cmd: >-
              docker push localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###

  # Deploy UDS_CORE with K3D_EXTRA_ARGS requesting all GPUs and the image
  # pushed to the local registry; removal deletes the "uds" k3d cluster.
  - name: create-cluster
    required: true
    actions:
      onDeploy:
        before:
          - description: "Create a k3d cluster with GPU support"
            cmd: |
              uds deploy ${ZARF_VAR_UDS_CORE} \
                --set K3D_EXTRA_ARGS="--gpus=all --image=localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###" \
                --no-progress --insecure --confirm
      onRemove:
        before:
          - description: "Delete the k3d cluster"
            cmd: |
              k3d cluster delete uds

  # Apply the test pod, wait up to 15s for it to reach phase Succeeded, print
  # its logs, then delete it.
  - name: test-cluster
    required: true
    files:
      - source: test/cuda-vector-add.yaml
        target: test/cuda-vector-add.yaml
    actions:
      onDeploy:
        before:
          - description: "Run the test pod"
            cmd: |
              uds zarf tools kubectl apply -f ./test/cuda-vector-add.yaml
              uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
              uds zarf tools kubectl logs -l app=gpu-pod
        after:
          - description: "Delete the test pod"
            cmd: |
              uds zarf tools kubectl delete -f ./test/cuda-vector-add.yaml