diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 4f39ac676..0f891a599 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -40,6 +40,15 @@ jobs:
         with:
           python-version-file: 'pyproject.toml'
 
+      - name: Build and Publish k3d-gpu image
+        run: |
+          cd packages/k3d-gpu
+          docker build \
+            --platform linux/amd64 \
+            -t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }} .
+          docker push ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${{ steps.get_version.outputs.version-without-v }}
+          cd ../..
+
       - name: Download Python Wheels and Publish Builder Image
         run: |
           docker buildx build --platform amd64,arm64 -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${{ steps.get_version.outputs.version-without-v }} --push -f src/leapfrogai_sdk/Dockerfile .
diff --git a/Makefile b/Makefile
index e83e76d5f..bff055321 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
 ARCH ?= amd64
 KEY ?= ""
 REG_PORT ?= 5000
+REG_NAME ?= registry
 
 VERSION ?= $(shell git describe --abbrev=0 --tags)
 LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)
@@ -33,7 +34,16 @@ gen-python: ## Generate the protobufs for the OpenAI typing within the leapfroga
 	src/leapfrogai_sdk/proto/leapfrogai_sdk/**/*.proto
 
 local-registry: ## Start up a local container registry. Errors in this target are ignored.
-	-docker run -d -p ${REG_PORT}:5000 --restart=always --name registry registry:2
+	@echo "Creating local Docker registry..."
+	-@docker run -d -p ${REG_PORT}:5000 --restart=always --name ${REG_NAME} registry:2
+	@echo "Local registry created at localhost:${REG_PORT}"
+
+
+# Clean up: Stop and remove the local registry
+clean-registry:
+	@echo "Cleaning up..."
+	@docker stop ${REG_NAME}
+	@docker rm ${REG_NAME}
 
 sdk-wheel: ## build wheels for the leapfrogai_sdk package as a dependency for other lfai components
 	docker build --platform=linux/${ARCH} -t ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${LOCAL_VERSION} -f src/leapfrogai_sdk/Dockerfile .
@@ -151,3 +161,5 @@ build-gpu: build-supabase build-api build-ui build-vllm build-text-embeddings bu
 build-all: build-cpu build-gpu ## Build all of the LFAI packages
 
 include tests/make-tests.mk
+
+include packages/k3d-gpu/Makefile
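A quick smoke test for the new `local-registry`/`clean-registry` targets (a sketch; assumes the defaults `REG_PORT=5000` and `REG_NAME=registry`, and uses `alpine` purely as a throwaway image):

```shell
make local-registry
docker pull alpine:3.19
docker tag alpine:3.19 localhost:5000/alpine:3.19
docker push localhost:5000/alpine:3.19
curl -s http://localhost:5000/v2/_catalog   # the registry API should list "alpine"
make clean-registry
```
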
diff --git a/packages/k3d-gpu/Dockerfile b/packages/k3d-gpu/Dockerfile
new file mode 100644
index 000000000..e2d14614a
--- /dev/null
+++ b/packages/k3d-gpu/Dockerfile
@@ -0,0 +1,34 @@
+ARG K3S_TAG=v1.28.8-k3s1
+ARG CUDA_TAG=12.4.1-base-ubuntu22.04
+
+FROM rancher/k3s:$K3S_TAG AS k3s
+FROM nvidia/cuda:$CUDA_TAG
+
+# Install the NVIDIA container toolkit
+RUN apt-get update && apt-get install -y curl \
+    && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+    && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+        sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+        tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \
+    && apt-get update && apt-get install -y nvidia-container-toolkit-base nvidia-container-toolkit nvidia-container-runtime util-linux \
+    && nvidia-ctk runtime configure --runtime=containerd
+
+COPY --from=k3s / / --exclude=/bin/
+COPY --from=k3s /bin /bin
+
+# Deploy the nvidia driver plugin on startup
+COPY plugin/device-plugin-daemonset.yaml /var/lib/rancher/k3s/server/manifests/nvidia-device-plugin-daemonset.yaml
+
+VOLUME /var/lib/kubelet
+VOLUME /var/lib/rancher/k3s
+VOLUME /var/lib/cni
+VOLUME /var/log
+
+# DIFF: resolve fsnotify issues
+RUN sysctl -w fs.inotify.max_user_watches=100000
+RUN sysctl -w fs.inotify.max_user_instances=100000
+
+ENV PATH="$PATH:/bin/aux"
+
+ENTRYPOINT ["/bin/k3s"]
+CMD ["agent"]
diff --git a/packages/k3d-gpu/Makefile b/packages/k3d-gpu/Makefile
new file mode 100644
index 000000000..7dfc7e5e9
--- /dev/null
+++ b/packages/k3d-gpu/Makefile
@@ -0,0 +1,26 @@
+MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+
+UDS_VERSION := 0.24.1
+LOCAL_VERSION ?= $(shell git rev-parse --short HEAD)
+
+
+build-k3d-gpu:
+	@cd ${MAKEFILE_DIR} && \
+	docker build \
+		--platform linux/amd64 \
+		-t ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${LOCAL_VERSION} .
+
+create-uds-gpu-cluster: build-k3d-gpu
+	@uds deploy k3d-core-slim-dev:${UDS_VERSION} \
+		--set K3D_EXTRA_ARGS="--gpus=all \
+		--image=ghcr.io/defenseunicorns/leapfrogai/k3d-gpu:${LOCAL_VERSION}" --confirm
+
+test-uds-gpu-cluster:
+	@cd ${MAKEFILE_DIR} && \
+	uds zarf tools kubectl apply -f ./test/cuda-vector-add.yaml
+	@uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=15s pod -l app=gpu-pod
+	@uds zarf tools kubectl logs -l app=gpu-pod
+	@cd ${MAKEFILE_DIR} && \
+	uds zarf tools kubectl delete -f ./test/cuda-vector-add.yaml
+
+.PHONY: build-k3d-gpu create-uds-gpu-cluster test-uds-gpu-cluster
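Before building this image or creating the cluster, it may help to confirm the host-side GPU plumbing the Dockerfile relies on. A minimal sketch, assuming the NVIDIA driver and the NVIDIA container toolkit are already installed on the host:

```shell
nvidia-smi                                # the driver should list your GPU(s)
docker run --rm --gpus all \
  nvidia/cuda:12.4.1-base-ubuntu22.04 \
  nvidia-smi                              # the GPU should also be visible inside a container
```
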
diff --git a/packages/k3d-gpu/README.md b/packages/k3d-gpu/README.md
new file mode 100644
index 000000000..dbe1e534a
--- /dev/null
+++ b/packages/k3d-gpu/README.md
@@ -0,0 +1,28 @@
+# K3D GPU
+
+Prepares a `k3s` + `nvidia/cuda` base image that gives a K3D cluster access to your host machine's CUDA-capable NVIDIA GPU(s).
+
+## Prerequisites
+
+* Docker: https://www.docker.com/
+* K3D: https://k3d.io/
+* UDS-CLI: https://github.com/defenseunicorns/uds-cli
+* A modern NVIDIA GPU with CUDA cores and drivers, plus the CUDA toolkit and the NVIDIA container toolkit.
+
+## Usage
+
+Check out the Make targets for the various options.
+
+### Local
+
+```shell
+make build-k3d-gpu # build the image
+
+make create-uds-gpu-cluster # create a uds cluster equipped with the k3d-gpu image
+
+make test-uds-gpu-cluster # deploy a test gpu pod to see if everything is working
+```
+
+## References
+
+* https://k3d.io/v5.7.2/usage/advanced/cuda/
diff --git a/packages/k3d-gpu/plugin/device-plugin-daemonset.yaml b/packages/k3d-gpu/plugin/device-plugin-daemonset.yaml
new file mode 100644
index 000000000..202280341
--- /dev/null
+++ b/packages/k3d-gpu/plugin/device-plugin-daemonset.yaml
@@ -0,0 +1,61 @@
+apiVersion: node.k8s.io/v1
+kind: RuntimeClass
+metadata:
+  name: nvidia
+handler: nvidia
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      name: nvidia-device-plugin-daemonset
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        name: nvidia-device-plugin-daemonset
+    spec:
+      runtimeClassName: nvidia # Explicitly request the runtime
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      # Mark this pod as a critical add-on; when enabled, the critical add-on
+      # scheduler reserves resources for critical add-on pods so that they can
+      # be rescheduled after a failure.
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      priorityClassName: "system-node-critical"
+      containers:
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0-rc.2
+          name: nvidia-device-plugin-ctr
+          env:
+            - name: PASS_DEVICE_SPECS
+              value: "true"
+            - name: FAIL_ON_INIT_ERROR
+              value: "true"
+            - name: DEVICE_LIST_STRATEGY
+              value: envvar
+            - name: DEVICE_ID_STRATEGY
+              value: uuid
+            - name: NVIDIA_VISIBLE_DEVICES
+              value: all
+            - name: NVIDIA_DRIVER_CAPABILITIES
+              value: all
+            - name: MPS_ROOT
+              value: /run/nvidia/mps
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
diff --git a/packages/k3d-gpu/test/cuda-vector-add.yaml b/packages/k3d-gpu/test/cuda-vector-add.yaml
new file mode 100644
index 000000000..019881296
--- /dev/null
+++ b/packages/k3d-gpu/test/cuda-vector-add.yaml
@@ -0,0 +1,21 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+  labels:
+    app: gpu-pod
+spec:
+  runtimeClassName: nvidia
+  restartPolicy: Never
+  containers:
+    - name: cuda-container
+      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda10.2
+      resources:
+        limits:
+          nvidia.com/gpu: "1" # requesting 1 GPU
+          cpu: "1"
+          memory: 0.5Gi
+  tolerations:
+    - key: nvidia.com/gpu
+      operator: Exists
+      effect: NoSchedule
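Once the cluster is up, the checks below mirror what `test-uds-gpu-cluster` automates (a sketch using the bundled kubectl; the timeout value is arbitrary, and "Test PASSED" is what the stock vectorAdd sample prints on success):

```shell
# The node should advertise nvidia.com/gpu capacity once the device plugin registers
uds zarf tools kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}'

# Run the vectorAdd pod, wait for completion, and check its logs for "Test PASSED"
uds zarf tools kubectl apply -f packages/k3d-gpu/test/cuda-vector-add.yaml
uds zarf tools kubectl wait --for=jsonpath='{.status.phase}'=Succeeded --timeout=60s pod -l app=gpu-pod
uds zarf tools kubectl logs -l app=gpu-pod
uds zarf tools kubectl delete -f packages/k3d-gpu/test/cuda-vector-add.yaml
```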