Commit a7e0505

Merge branch 'justin-the-law' into chore-add-nvidia-runtimeClass

gphorvath authored Jul 20, 2024
2 parents 1096c89 + ad83b0b
Showing 9 changed files with 327 additions and 1 deletion.
48 changes: 48 additions & 0 deletions .github/workflows/build-images.yaml
@@ -0,0 +1,48 @@
name: Build Images

on:
  workflow_dispatch:
  pull_request:
    types:
      - ready_for_review
      - review_requested
      - synchronize

permissions:
  contents: read
  packages: write

env:
  K3S_TAG: ""
  CUDA_TAG: ""

jobs:
  build-and-publish-images:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Repo
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

      - name: Read tags from config
        id: read_tags
        run: |
          echo "K3S_TAG=$(jq -r .k3s_tag packages/k3d-gpu/version_config.json)" >> $GITHUB_ENV
          echo "CUDA_TAG=$(jq -r .cuda_tag packages/k3d-gpu/version_config.json)" >> $GITHUB_ENV

      - name: Login to GitHub Container Registry
        uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and Publish k3d-gpu
        run: |
          docker build \
            --platform linux/amd64 \
            --build-arg K3S_TAG=${{ env.K3S_TAG }} \
            --build-arg CUDA_TAG=${{ env.CUDA_TAG }} \
            -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \
            -f packages/k3d-gpu/Dockerfile .
          docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest
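
Note: the `jq` lookups in the "Read tags from config" step assume a pin file at `packages/k3d-gpu/version_config.json` exposing `k3s_tag` and `cuda_tag` keys; that file is not part of this diff. A plausible shape, with values taken from the defaults in `packages/k3d-gpu/zarf.yaml` below:

```json
{
  "k3s_tag": "v1.28.8-k3s1",
  "cuda_tag": "12.4.1-base-ubuntu22.04"
}
```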
14 changes: 13 additions & 1 deletion Makefile
@@ -1,6 +1,7 @@
 ARCH ?= amd64
 KEY ?= ""
 REG_PORT ?= 5000
+REG_NAME ?= registry
 
 VERSION ?= $(shell git describe --abbrev=0 --tags)
 LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)
@@ -33,7 +34,16 @@ gen-python: ## Generate the protobufs for the OpenAI typing within the leapfroga
 	src/leapfrogai_sdk/proto/leapfrogai_sdk/**/*.proto
 
 local-registry: ## Start up a local container registry. Errors in this target are ignored.
-	-docker run -d -p ${REG_PORT}:5000 --restart=always --name registry registry:2
+	@echo "Creating local Docker registry..."
+	-@docker run -d -p ${REG_PORT}:5000 --restart=always --name ${REG_NAME} registry:2
+	@echo "Local registry created at localhost:${REG_PORT}"
+
+
+# Clean up: Stop and remove the local registry
+clean-registry:
+	@echo "Cleaning up..."
+	@docker stop ${REG_NAME}
+	@docker rm ${REG_NAME}
 
 sdk-wheel: ## build wheels for the leapfrogai_sdk package as a dependency for other lfai components
 	docker build --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${LOCAL_VERSION} -f src/leapfrogai_sdk/Dockerfile .
@@ -151,3 +161,5 @@ build-gpu: build-supabase build-api build-ui build-vllm build-text-embeddings bu
 build-all: build-cpu build-gpu ## Build all of the LFAI packages
 
 include tests/make-tests.mk
+
+include packages/k3d-gpu/Makefile
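
For reference, the new registry targets can be exercised like this (a sketch; both variables fall back to the defaults declared at the top of the Makefile):

```shell
# Start the local registry, overriding the defaults REG_PORT=5000 and REG_NAME=registry
make local-registry REG_PORT=5001 REG_NAME=my-registry

# Stop and remove it when finished
make clean-registry REG_NAME=my-registry
```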
34 changes: 34 additions & 0 deletions packages/k3d-gpu/Dockerfile
@@ -0,0 +1,34 @@
ARG K3S_TAG
ARG CUDA_TAG

FROM rancher/k3s:$K3S_TAG AS k3s
FROM nvidia/cuda:$CUDA_TAG

# Install the NVIDIA container toolkit
RUN apt-get update && apt-get install -y curl \
    && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
    && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
        sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
        tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
    && apt-get update && apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux \
    && nvidia-ctk runtime configure --runtime=containerd

COPY --from=k3s / / --exclude=/bin/
COPY --from=k3s /bin /bin

# Deploy the nvidia driver plugin on startup
COPY ./plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml

VOLUME /var/lib/kubelet
VOLUME /var/lib/rancher/k3s
VOLUME /var/lib/cni
VOLUME /var/log

# DIFF: resolve fsnotify issues
RUN sysctl -w fs.inotify.max_user_watches=100000
RUN sysctl -w fs.inotify.max_user_instances=100000

ENV PATH="$PATH:/bin/aux"

ENTRYPOINT ["/bin/k3s"]
CMD ["agent"]
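
Building the image standalone requires both build args, since the `FROM` lines carry no fallback tags; a minimal sketch using the default versions pinned in `packages/k3d-gpu/zarf.yaml`:

```shell
# Build the combined k3s + CUDA node image (tag values assumed from the zarf.yaml defaults)
docker build \
  --platform linux/amd64 \
  --build-arg K3S_TAG=v1.28.8-k3s1 \
  --build-arg CUDA_TAG=12.4.1-base-ubuntu22.04 \
  -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:latest \
  -f packages/k3d-gpu/Dockerfile .
```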
11 changes: 11 additions & 0 deletions packages/k3d-gpu/Makefile
@@ -0,0 +1,11 @@
MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

k3d-gpu-package:
	@cd ${MAKEFILE_DIR} && \
	uds zarf package create

create-uds-gpu-cluster:
	@cd ${MAKEFILE_DIR} && \
	uds zarf package deploy zarf-package-k3d-gpu-amd64-*.tar.zst --confirm

.PHONY: k3d-gpu-package create-uds-gpu-cluster
28 changes: 28 additions & 0 deletions packages/k3d-gpu/README.md
@@ -0,0 +1,28 @@
# K3D GPU

Prepares a `k3s` + `nvidia/cuda` base image that gives a K3D cluster access to your host machine's CUDA-capable NVIDIA GPU(s).

## Prerequisites

* Docker: https://www.docker.com/
* K3D: https://k3d.io/
* UDS-CLI: https://github.com/defenseunicorns/uds-cli
* A modern, CUDA-capable NVIDIA GPU with current drivers must be present. Additionally, the CUDA toolkit and the NVIDIA container toolkit must be installed.

## Usage

Check out the Make targets for the various options.

### Local

```shell
make k3d-gpu-package         # build the k3d-gpu Zarf package

make create-uds-gpu-cluster  # deploy the package: builds and pushes the image, creates a UDS cluster from it, and runs a GPU test pod
```

## References

* https://k3d.io/v5.7.2/usage/advanced/cuda/
61 changes: 61 additions & 0 deletions packages/k3d-gpu/plugin/device-plugin-daemonset.yaml
@@ -0,0 +1,61 @@
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-daemonset
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-daemonset
    spec:
      runtimeClassName: nvidia # Explicitly request the runtime
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      containers:
        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
          name: nvidia-device-plugin-ctr
          env:
            - name: PASS_DEVICE_SPECS
              value: "true"
            - name: FAIL_ON_INIT_ERROR
              value: "true"
            - name: DEVICE_LIST_STRATEGY
              value: envvar
            - name: DEVICE_ID_STRATEGY
              value: uuid
            - name: NVIDIA_VISIBLE_DEVICES
              value: all
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: all
            - name: MPS_ROOT
              value: /run/nvidia/mps
          securityContext:
            allowPrivilegeEscalation: false
            capabilities:
              drop: ["ALL"]
          volumeMounts:
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
      volumes:
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
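
Once k3s auto-applies this manifest from `/var/lib/rancher/k3s/server/manifests/`, the plugin can be spot-checked with standard `kubectl` queries (illustrative commands, not part of this changeset):

```shell
# Confirm the RuntimeClass and the device-plugin DaemonSet were created
kubectl get runtimeclass nvidia
kubectl -n kube-system get daemonset nvidia-device-plugin-daemonset

# A GPU node should now advertise an allocatable nvidia.com/gpu resource
kubectl describe node | grep nvidia.com/gpu
```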
21 changes: 21 additions & 0 deletions packages/k3d-gpu/test/cuda-vector-add.yaml
@@ -0,0 +1,21 @@
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod
  labels:
    app: gpu-pod
spec:
  runtimeClassName: nvidia
  restartPolicy: Never
  containers:
    - name: cuda-container
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
      resources:
        limits:
          nvidia.com/gpu: "1" # requesting 1 GPU
          cpu: "1"
          memory: 0.5Gi
  tolerations:
    - key: nvidia.com/gpu
      operator: Exists
      effect: NoSchedule
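
Run by hand (the `test-cluster` Zarf component below wraps the same steps), verification would look roughly like:

```shell
kubectl apply -f packages/k3d-gpu/test/cuda-vector-add.yaml
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
kubectl logs -l app=gpu-pod   # the vectorAdd sample prints "Test PASSED" on success
kubectl delete -f packages/k3d-gpu/test/cuda-vector-add.yaml
```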
7 changes: 7 additions & 0 deletions packages/k3d-gpu/zarf-config.yaml
@@ -0,0 +1,7 @@
package:
  create:
    set:
      # x-release-please-start-version
      version: 0.9.1
      # x-release-please-end
      reg_name: registry
104 changes: 104 additions & 0 deletions packages/k3d-gpu/zarf.yaml
@@ -0,0 +1,104 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/defenseunicorns/zarf/main/zarf.schema.json

kind: ZarfPackageConfig
metadata:
  name: "k3d-gpu"
  version: '###ZARF_PKG_TMPL_VERSION###'
  description: >
    k3d base image with GPU support
variables:
  - name: REG_PORT
    description: "Local registry port"
    default: "5000"
  - name: UDS_CORE
    description: "UDS Core version to use"
    default: "k3d-core-slim-dev:0.24.0"
  - name: K3S_TAG
    description: "K3s version to use"
    default: "v1.28.8-k3s1"
  - name: CUDA_TAG
    description: "CUDA version to use"
    default: "12.4.1-base-ubuntu22.04"

components:
  - name: create-local-registry
    required: true
    actions:
      onDeploy:
        before:
          - cmd: |
              set +e
              docker run -d --name ###ZARF_PKG_TMPL_REG_NAME### -p ${ZARF_VAR_REG_PORT}:5000 registry:2
              set -e
            description: "Start the local registry"
      onRemove:
        before:
          - cmd: |
              set +e
              docker stop ###ZARF_PKG_TMPL_REG_NAME###
              docker rm ###ZARF_PKG_TMPL_REG_NAME###
              set -e
            description: "Stop and remove the local registry"
  - name: build-image
    required: true
    files:
      - source: Dockerfile
        target: Dockerfile
      - source: plugin/device-plugin-daemonset.yaml
        target: plugin/device-plugin-daemonset.yaml
    actions:
      onDeploy:
        before:
          - cmd: |
              docker build \
                --platform linux/amd64 \
                --build-arg K3S_TAG=${ZARF_VAR_K3S_TAG} \
                --build-arg CUDA_TAG=${ZARF_VAR_CUDA_TAG} \
                -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \
                -f ./Dockerfile .
              docker tag \
                ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION### \
                localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###
            description: "Build the k3d-gpu image"
        after:
          - cmd:
              docker push localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###
            description: "Push the image to the local registry"

  - name: create-cluster
    required: true
    actions:
      onDeploy:
        before:
          - cmd: |
              uds deploy ${ZARF_VAR_UDS_CORE} \
                --set K3D_EXTRA_ARGS="--gpus=all \
                --image=localhost:${ZARF_VAR_REG_PORT}/defenseunicorns/leapfrogai/k3d-gpu:###ZARF_PKG_TMPL_VERSION###" \
                --no-progress --insecure --confirm
            description: "Create a k3d cluster with GPU support"
      onRemove:
        before:
          - cmd: |
              k3d cluster delete uds
            description: "Delete the k3d cluster"
  - name: test-cluster
    required: true
    files:
      - source: test/cuda-vector-add.yaml
        target: test/cuda-vector-add.yaml
    actions:
      onDeploy:
        before:
          - cmd: |
              uds zarf tools kubectl apply -f ./test/cuda-vector-add.yaml
              uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
              uds zarf tools kubectl logs -l app=gpu-pod
            description: "Run the test pod"
        after:
          - cmd: |
              uds zarf tools kubectl delete -f ./test/cuda-vector-add.yaml
            description: "Delete the test pod"
