Commit a7e0505

Merge branch 'justin-the-law' into chore-add-nvidia-runtimeClass

gphorvath authored Jul 20, 2024
2 parents 1096c89 + ad83b0b
Showing 9 changed files with 327 additions and 1 deletion.
48 changes: 48 additions & 0 deletions .github/workflows/build-images.yaml
@@ -0,0 +1,48 @@
name: Build Images

on:
  workflow_dispatch:
  pull_request:
    types:
      - ready_for_review
      - review_requested
      - synchronize

permissions:
  contents: read
  packages: write

env:
  K3S_TAG: ""
  CUDA_TAG: ""

jobs:
  build-and-publish-images:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Repo
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

      - name: Read tags from config
        id: read_tags
        run: |
          echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/version_config.json)" >> $GITHUB_ENV
          echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/version_config.json)" >> $GITHUB_ENV

      - name: Login to GitHub Container Registry
        uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and Publish k3d-gpu
        run: |
          docker build \
            --platform linux/amd64 \
            --build-arg K3S_TAG=${{ env.K3S_TAG }} \
            --build-arg CUDA_TAG=${{ env.CUDA_TAG }} \
            -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \
            -f packages/k3d-gpu/Dockerfile .
          docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest
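
Note: the `jq` lookups in the "Read tags from config" step assume a pin file at `packages/k3d-gpu/version_config.json` exposing `k3s_tag` and `cuda_tag` keys; that file is not part of this diff. A plausible shape, with values taken from the defaults in `packages/k3d-gpu/zarf.yaml` below:

```json
{
  "k3s_tag": "v1.28.8-k3s1",
  "cuda_tag": "12.4.1-base-ubuntu22.04"
}
```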
14 changes: 13 additions & 1 deletion Makefile
@@ -1,6 +1,7 @@
 ARCH ?= amd64
 KEY ?= ""
 REG_PORT ?= 5000
+REG_NAME ?= registry
 
 VERSION ?= $(shell git describe --abbrev=0 --tags)
 LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)
@@ -33,7 +34,16 @@ gen-python: ## Generate the protobufs for the OpenAI typing within the leapfroga
 	src/leapfrogai_sdk/proto/leapfrogai_sdk/**/*.proto
 
 local-registry: ## Start up a local container registry. Errors in this target are ignored.
-	-docker run -d -p ${REG_PORT}:5000 --restart=always --name registry registry:2
+	@echo "Creating local Docker registry..."
+	-@docker run -d -p ${REG_PORT}:5000 --restart=always --name ${REG_NAME} registry:2
+	@echo "Local registry created at localhost:${REG_PORT}"
+
+
+# Clean up: Stop and remove the local registry
+clean-registry:
+	@echo "Cleaning up..."
+	@docker stop ${REG_NAME}
+	@docker rm ${REG_NAME}
 
 sdk-wheel: ## build wheels for the leapfrogai_sdk package as a dependency for other lfai components
 	docker build --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${LOCAL_VERSION} -f src/leapfrogai_sdk/Dockerfile .
@@ -151,3 +161,5 @@ build-gpu: build-supabase build-api build-ui build-vllm build-text-embeddings bu
 build-all: build-cpu build-gpu ## Build all of the LFAI packages
 
 include tests/make-tests.mk
+
+include packages/k3d-gpu/Makefile
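
For reference, the new registry targets can be exercised like this (a sketch; both variables fall back to the defaults declared at the top of the Makefile):

```shell
# Start the local registry, overriding the defaults REG_PORT=5000 and REG_NAME=registry
make local-registry REG_PORT=5001 REG_NAME=my-registry

# Stop and remove it when finished
make clean-registry REG_NAME=my-registry
```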
34 changes: 34 additions & 0 deletions packages/k3d-gpu/Dockerfile
@@ -0,0 +1,34 @@
ARG K3S_TAG
ARG CUDA_TAG

FROM rancher/k3s:$K3S_TAG AS k3s
FROM nvidia/cuda:$CUDA_TAG

# Install the NVIDIA container toolkit
RUN apt-get update && apt-get install -y curl \
    && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
    && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
        sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
        tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
    && apt-get update && apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux \
    && nvidia-ctk runtime configure --runtime=containerd

COPY --from=k3s / / --exclude=/bin/
COPY --from=k3s /bin /bin

# Deploy the nvidia driver plugin on startup
COPY ./plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml

VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log

# DIFF: resolve fsnotify issues
RUN sysctl -w fs.inotify.max_user_watches=100000
RUN sysctl -w fs.inotify.max_user_instances=100000

ENV PATH="$PATH:/bin/aux"

ENTRYPOINT ["/bin/k3s"]
CMD ["agent"]
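
Building the image standalone requires both build args, since the `FROM` lines carry no fallback tags; a minimal sketch using the default versions pinned in `packages/k3d-gpu/zarf.yaml`:

```shell
# Build the combined k3s + CUDA node image (tag values assumed from the zarf.yaml defaults)
docker build \
  --platform linux/amd64 \
  --build-arg K3S_TAG=v1.28.8-k3s1 \
  --build-arg CUDA_TAG=12.4.1-base-ubuntu22.04 \
  -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \
  -f packages/k3d-gpu/Dockerfile .
```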
11 changes: 11 additions & 0 deletions packages/k3d-gpu/Makefile
@@ -0,0 +1,11 @@
MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

k3d-gpu-package:
	@cd ${MAKEFILE_DIR} && \
	uds zarf package create

create-uds-gpu-cluster:
	@cd ${MAKEFILE_DIR} && \
	uds zarf package deploy zarf-package-k3d-gpu-amd64-*.tar.zst --confirm

.PHONY: k3d-gpu-package create-uds-gpu-cluster
28 changes: 28 additions & 0 deletions packages/k3d-gpu/README.md
@@ -0,0 +1,28 @@
# K3D GPU

Prepares a `k3s` + `nvidia/cuda` base image that gives a K3D cluster access to your host machine's CUDA-capable NVIDIA GPU(s).

## Prerequisites

* Docker: https://www.docker.com/
* K3D: https://k3d.io/
* UDS-CLI: https://github.com/defenseunicorns/uds-cli
* A modern, CUDA-capable NVIDIA GPU with current drivers must be present. Additionally, the CUDA toolkit and the NVIDIA container toolkit must be installed.

## Usage

Check out the Make targets for the various options.

### Local

```shell
make k3d-gpu-package         # build the k3d-gpu Zarf package

make create-uds-gpu-cluster  # deploy the package: builds and pushes the image, creates a UDS cluster from it, and runs a GPU test pod
```

## References

* https://k3d.io/v5.7.2/usage/advanced/cuda/
61 changes: 61 additions & 0 deletions packages/k3d-gpu/plugin/device-plugin-daemonset.yaml
@@ -0,0 +1,61 @@
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-daemonset
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-daemonset
    spec:
      runtimeClassName: nvidia # Explicitly request the runtime
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
          name: nvidia-device-plugin-ctr
          env:
            - name: PASS_DEVICE_SPECS
              value: "true"
            - name: FAIL_ON_INIT_ERROR
              value: "true"
            - name: DEVICE_LIST_STRATEGY
              value: envvar
            - name: DEVICE_ID_STRATEGY
              value: uuid
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: MPS_ROOT
              value: /run/nvidia/mps
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
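
Once k3s auto-applies this manifest from `/var/lib/rancher/k3s/server/manifests/`, the plugin can be spot-checked with standard `kubectl` queries (illustrative commands, not part of this changeset):

```shell
# Confirm the RuntimeClass and the device-plugin DaemonSet were created
kubectl get runtimeclass nvidia
kubectl -n kube-system get daemonset nvidia-device-plugin-daemonset

# A GPU node should now advertise an allocatable nvidia.com/gpu resource
kubectl describe node | grep nvidia.com/gpu
```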
21 changes: 21 additions & 0 deletions packages/k3d-gpu/test/cuda-vector-add.yaml
@@ -0,0 +1,21 @@
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
  labels:
    app: gpu-pod
spec:
  runtimeClassName: nvidia
  restartPolicy: Never
  containers:
    - name: cuda-container
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
      resources:
        limits:
          nvidia.com/gpu: "1" # requesting 1 GPU
          cpu: "1"
          memory: 0.5Gi
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
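
Run by hand (the `test-cluster` Zarf component below wraps the same steps), verification would look roughly like:

```shell
kubectl apply -f packages/k3d-gpu/test/cuda-vector-add.yaml
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
kubectl logs -l app=gpu-pod   # the vectorAdd sample prints "Test PASSED" on success
kubectl delete -f packages/k3d-gpu/test/cuda-vector-add.yaml
```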
7 changes: 7 additions & 0 deletions packages/k3d-gpu/zarf-config.yaml
@@ -0,0 +1,7 @@
package:
  create:
    set:
      # x-release-please-start-version
      version: 0.9.1
      # x-release-please-end
      reg_name: registry
104 changes: 104 additions & 0 deletions packages/k3d-gpu/zarf.yaml
@@ -0,0 +1,104 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/zarf/main/zarf.schema.json

kind: ZarfPackageConfig
metadata:
  name: "k3d-gpu"
  version: '###ZARF_PKG_TMPL_VERSION###'
  description: >
    k3d base image with GPU support
variables:
  - name: REG_PORT
    description: "Local registry port"
    default: "5000"
  - name: UDS_CORE
    description: "UDS Core version to use"
    default: "k3d-core-slim-dev:0.24.0"
  - name: K3S_TAG
    description: "K3s version to use"
    default: "v1.28.8-k3s1"
  - name: CUDA_TAG
    description: "CUDA version to use"
    default: "12.4.1-base-ubuntu22.04"

components:
  - name: create-local-registry
    required: true
    actions:
      onDeploy:
        before:
          - cmd: |
              set +e
              docker run -d --name ###ZARF_PKG_TMPL_REG_NAME### -p ${ZARF_VAR_REG_PORT}:5000 registry:2
              set -e
            description: "Start the local registry"
      onRemove:
        before:
          - cmd: |
              set +e
              docker stop ###ZARF_PKG_TMPL_REG_NAME###
              docker rm ###ZARF_PKG_TMPL_REG_NAME###
              set -e
            description: "Stop and remove the local registry"
  - name: build-image
    required: true
    files:
      - source: Dockerfile
        target: Dockerfile
      - source: plugin/device-plugin-daemonset.yaml
        target: plugin/device-plugin-daemonset.yaml
    actions:
      onDeploy:
        before:
          - cmd: |
              docker build \
                --platform linux/amd64 \
                --build-arg K3S_TAG=${ZARF_VAR_K3S_TAG} \
                --build-arg CUDA_TAG=${ZARF_VAR_CUDA_TAG} \
                -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \
                -f ./Dockerfile .
              docker tag \
                ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \
                localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###
            description: "Build the k3d-gpu image"
        after:
          - cmd:
              docker push localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###
            description: "Push the image to the local registry"

  - name: create-cluster
    required: true
    actions:
      onDeploy:
        before:
          - cmd: |
              uds deploy ${ZARF_VAR_UDS_CORE} \
                --set K3D_EXTRA_ARGS="--gpus=all \
                --image=localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###" \
                --no-progress --insecure --confirm
            description: "Create a k3d cluster with GPU support"
      onRemove:
        before:
          - cmd: |
              k3d cluster delete uds
            description: "Delete the k3d cluster"
  - name: test-cluster
    required: true
    files:
      - source: test/cuda-vector-add.yaml
        target: test/cuda-vector-add.yaml
    actions:
      onDeploy:
        before:
          - cmd: |
              uds zarf tools kubectl apply -f ./test/cuda-vector-add.yaml
              uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
              uds zarf tools kubectl logs -l app=gpu-pod
            description: "Run the test pod"
        after:
          - cmd: |
              uds zarf tools kubectl delete -f ./test/cuda-vector-add.yaml
            description: "Delete the test pod"
