From 0b302b319fac66b8a782ae0ea64ee4ac21ba96ed Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Wed, 9 Oct 2024 21:56:51 +0000 Subject: [PATCH 1/2] Add changes for alpha, node types, nvidia driver --- platforms/gke-aiml/playground/configsync.tf | 25 ++++ .../gke-aiml/playground/container_cluster.tf | 21 ++- .../playground/container_node_pool.tf | 88 ++++++++++-- .../scripts/nvidia_dra_driver_manifests.sh | 58 ++++++++ .../_cluster_template/dra/kustomization.yaml | 17 +++ .../dra/nvidia-dra-drivers/kustomization.yaml | 25 ++++ ...-driver-installer-daemonset-preloaded.yaml | 126 ++++++++++++++++++ ...r-installer-prepare-gke-nodes-for-dra.yaml | 57 ++++++++ .../dra/nvidia-dra-drivers/values.yaml | 38 ++++++ .../_cluster_template/namespace-nvidia.yaml | 18 +++ terraform/modules/cluster/gke.tf | 8 +- terraform/modules/node-pools/nodepools.tf | 3 +- 12 files changed, 461 insertions(+), 23 deletions(-) create mode 100755 platforms/gke-aiml/playground/scripts/nvidia_dra_driver_manifests.sh create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/kustomization.yaml create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/kustomization.yaml create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-daemonset-preloaded.yaml create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-prepare-gke-nodes-for-dra.yaml create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/values.yaml create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/namespace-nvidia.yaml diff --git a/platforms/gke-aiml/playground/configsync.tf b/platforms/gke-aiml/playground/configsync.tf index 41a7457b..3f6f8923 100644 --- 
a/platforms/gke-aiml/playground/configsync.tf +++ b/platforms/gke-aiml/playground/configsync.tf @@ -161,6 +161,31 @@ resource "null_resource" "kueue" { # } # } +# NVIDIA DRA DRIVER +############################################################################### +resource "null_resource" "nvidia_dra_driver" { + depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement, + google_secret_manager_secret_version.git_config, + module.configsync_repository, + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/nvidia_dra_driver_manifests.sh" + environment = { + GIT_CONFIG_SECRET_NAME = local.git_config_secret_name + GIT_REPOSITORY = local.git_repository + MANIFESTS_DIRECTORY = local.configsync_manifests_directory + PROJECT_ID = data.google_project.environment.project_id + MLP_AR_REPO_URL = "${google_artifact_registry_repository.container_images.location}-docker.pkg.dev/${google_artifact_registry_repository.container_images.project}/${var.environment_name}/k8s-dra-driver:v0.1.0" + } + } + + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers", "**") : md5("${path.module}/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/${f}")])) + md5_script = filemd5("${path.module}/scripts/nvidia_dra_driver_manifests.sh") + } +} # KUBERAY MANIFESTS diff --git a/platforms/gke-aiml/playground/container_cluster.tf b/platforms/gke-aiml/playground/container_cluster.tf index a9bbb5ee..041d8cad 100644 --- a/platforms/gke-aiml/playground/container_cluster.tf +++ b/platforms/gke-aiml/playground/container_cluster.tf @@ -54,6 +54,7 @@ resource "google_container_cluster" "mlp" { project = data.google_project.environment.project_id remove_default_node_pool = false subnetwork = module.create-vpc.subnet-1 + enable_kubernetes_alpha = true addons_config { gcp_filestore_csi_driver_config { @@ -74,15 +75,18 @@ resource "google_container_cluster" "mlp" { 
enabled = true auto_provisioning_defaults { - disk_type = "pd-balanced" + disk_type = "pd-standard" + disk_size = 100 + image_type = "UBUNTU_CONTAINERD" oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] service_account = google_service_account.cluster.email management { - auto_repair = true - auto_upgrade = true + auto_repair = false + auto_upgrade = false } shielded_instance_config { @@ -221,11 +225,17 @@ resource "google_container_cluster" "mlp" { enable_private_nodes = true } + management { + auto_repair = false + auto_upgrade = false + } + node_config { machine_type = "e2-standard-4" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] gcfs_config { @@ -245,6 +255,7 @@ resource "google_container_cluster" "mlp" { enabled = true } } + } private_cluster_config { diff --git a/platforms/gke-aiml/playground/container_node_pool.tf b/platforms/gke-aiml/playground/container_node_pool.tf index 28a637b0..8362abbd 100644 --- a/platforms/gke-aiml/playground/container_node_pool.tf +++ b/platforms/gke-aiml/playground/container_node_pool.tf @@ -36,6 +36,11 @@ resource "google_container_node_pool" "cpu_n4s8" { total_min_node_count = 1 } + management { + auto_repair = false + auto_upgrade = false + } + network_config { enable_private_nodes = true } @@ -49,7 +54,8 @@ resource "google_container_node_pool" "cpu_n4s8" { machine_type = "n4-standard-8" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -78,16 +84,18 @@ resource "google_container_node_pool" "cpu_n4s8" { } timeouts { - 
create = "30m" - update = "20m" + create = "1m" + update = "1m" + delete = "1m" } } -# GPU +# GPU node shapes # Available zones: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-table -############################################################################### +############################################################################## +# TODO: don't need for DRA demo resource "google_container_node_pool" "gpu_a100x2_a2h2" { depends_on = [google_gke_hub_membership.cluster] @@ -110,6 +118,11 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -130,7 +143,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2" { machine_type = "a2-highgpu-2g" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -173,8 +187,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2" { } } -############################################################################### - +############################################################################## +# TODO: don't need for DRA demo resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { depends_on = [google_gke_hub_membership.cluster] @@ -197,6 +211,11 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -217,7 +236,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { machine_type = "a2-highgpu-2g" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + 
"https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -286,6 +306,11 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -301,12 +326,17 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { # Variables labels = { "resource-model" : "h100" - "resource-type" : "gpu" + "resource-type" : "gpu", + "gke-no-default-nvidia-gpu-device-plugin" : "true", + "nvidia.com/gpu" : "present", + "nvidia.com/dra.kubelet-plugin" : "true" + "nvidia.com/dra.controller" : "true" } machine_type = "a3-highgpu-8g" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -379,6 +409,11 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -395,11 +430,16 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { labels = { "resource-model" : "l4" "resource-type" : "gpu" + "gke-no-default-nvidia-gpu-device-plugin" : "true", + "nvidia.com/gpu" : "present", + "nvidia.com/dra.kubelet-plugin" : "true" + "nvidia.com/dra.controller" : "true" } machine_type = "g2-standard-24" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -464,6 +504,11 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -480,11 +525,16 @@ resource 
"google_container_node_pool" "gpu_l4x2_g2s24_dws" { labels = { "resource-model" : "l4" "resource-type" : "gpu" + "gke-no-default-nvidia-gpu-device-plugin" : "true", + "nvidia.com/gpu" : "present", + "nvidia.com/dra.kubelet-plugin" : "true" + "nvidia.com/dra.controller" : "true" } machine_type = "g2-standard-24" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -554,6 +604,11 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -570,11 +625,16 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { labels = { "resource-model" : "l4" "resource-type" : "gpu" + "gke-no-default-nvidia-gpu-device-plugin" : "true", + "nvidia.com/gpu" : "present", + "nvidia.com/dra.kubelet-plugin" : "true" + "nvidia.com/dra.controller" : "true" } machine_type = "g2-standard-24" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] spot = true diff --git a/platforms/gke-aiml/playground/scripts/nvidia_dra_driver_manifests.sh b/platforms/gke-aiml/playground/scripts/nvidia_dra_driver_manifests.sh new file mode 100755 index 00000000..2b23ddce --- /dev/null +++ b/platforms/gke-aiml/playground/scripts/nvidia_dra_driver_manifests.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -u + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +source ${SCRIPT_PATH}/helpers/clone_git_repo.sh + +# Set directory and path variables +clusters_directory="manifests/clusters" +clusters_path="${GIT_REPOSITORY_PATH}/${clusters_directory}" +cluster_template_directory="templates/_cluster_template" +cluster_template_path="${GIT_REPOSITORY_PATH}/${cluster_template_directory}" +repo_container_images_url="${MLP_AR_REPO_URL}" + +cd "${clusters_path}" || { + echo "Clusters directory '${clusters_directory}' does not exist" + exit 100 +} + +echo "'${repo_container_images_url}'" + +cp -pr ${cluster_template_path}/dra/nvidia-dra-drivers ${clusters_path}/ +cp -pr ${cluster_template_path}/namespace-nvidia.yaml ${clusters_path}/ + +# Added entries to the kustomization file +resources=$(find ${clusters_path} -maxdepth 1 -mindepth 1 -type d | sort) +resources+=" " +export resources+=$(find ${clusters_path} -maxdepth 1 -type f -name "*.yaml" ! -name "kustomization.yaml" ! -name "*values.yaml" | sort) +export kustomization_file="${clusters_path}/kustomization.yaml" +source ${SCRIPT_PATH}/helpers/add_to_kustomization.sh + +#TODO: build the image here as well +# something like `./${k8s-nvidia-driver-path}/demo/clusters/kind/scripts/build-driver-image.sh` +docker tag nvcr.io/nvidia/cloud-native/k8s-dra-driver:v0.1.0 ${repo_container_images_url} +docker push ${repo_container_images_url} + +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} +git add . 
+git commit -m "Manifests for NVIDIA DRA" +git push origin diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/kustomization.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/kustomization.yaml new file mode 100644 index 00000000..fe298fe7 --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/kustomization.yaml @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/kustomization.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/kustomization.yaml new file mode 100644 index 00000000..f3bb902d --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/kustomization.yaml @@ -0,0 +1,25 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Another option is to use the NVIDIA Helm chart +# helmCharts: +# - name: k8s-dra-driver +# repo: https://github.com/NVIDIA/k8s-dra-driver/tree/main/deployments/helm +# version: 0.1.0 +# releaseName: k8s-dra-driver +# includeCRDs: true +# valuesFile: ??.yaml \ No newline at end of file diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-daemonset-preloaded.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-daemonset-preloaded.yaml new file mode 100644 index 00000000..7cb4c8d5 --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-daemonset-preloaded.yaml @@ -0,0 +1,126 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This daemonset installs nvidia driver 450.80.02 and invokes the +# partition_gpu tool to enable MIG mode and create GPU instances as specified +# in the GPU config. + +##### SOURCE: https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/master/nvidia-driver-installer/cos/daemonset-nvidia-mig.yaml +### NEED UBUNTU? check https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/master/nvidia-driver-installer/ubuntu/daemonset.yaml + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-driver-installer + namespace: kube-system + labels: + k8s-app: nvidia-driver-installer +spec: + selector: + matchLabels: + k8s-app: nvidia-driver-installer + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-driver-installer + k8s-app: nvidia-driver-installer + spec: + priorityClassName: system-node-critical + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists + tolerations: + - operator: "Exists" + hostNetwork: true + hostPID: true + volumes: + - name: dev + hostPath: + path: /dev + - name: vulkan-icd-mount + hostPath: + path: /home/kubernetes/bin/nvidia/vulkan/icd.d + - name: nvidia-install-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: root-mount + hostPath: + path: / + - name: cos-tools + hostPath: + path: /var/lib/cos-tools + - name: nvidia-config + hostPath: + path: /etc/nvidia + initContainers: + - image: "cos-nvidia-installer:fixed" + imagePullPolicy: Never + name: nvidia-driver-installer + resources: + requests: + cpu: 150m + securityContext: + privileged: true + env: + - name: NVIDIA_INSTALL_DIR_HOST + value: /home/kubernetes/bin/nvidia + - name: NVIDIA_INSTALL_DIR_CONTAINER + value: /usr/local/nvidia + - name: VULKAN_ICD_DIR_HOST + value: /home/kubernetes/bin/nvidia/vulkan/icd.d + - name: VULKAN_ICD_DIR_CONTAINER + value: 
/etc/vulkan/icd.d + - name: ROOT_MOUNT_DIR + value: /root + - name: COS_TOOLS_DIR_HOST + value: /var/lib/cos-tools + - name: COS_TOOLS_DIR_CONTAINER + value: /build/cos-tools + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia + - name: vulkan-icd-mount + mountPath: /etc/vulkan/icd.d + - name: dev + mountPath: /dev + - name: root-mount + mountPath: /root + - name: cos-tools + mountPath: /build/cos-tools + - image: "gcr.io/gke-release/nvidia-partition-gpu@sha256:e226275da6c45816959fe43cde907ee9a85c6a2aa8a429418a4cadef8ecdb86a" + name: partition-gpus + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + resources: + requests: + cpu: 150m + securityContext: + privileged: true + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia + - name: dev + mountPath: /dev + - name: nvidia-config + mountPath: /etc/nvidia + containers: + - image: "registry.k8s.io/pause:3.9" + name: pause \ No newline at end of file diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-prepare-gke-nodes-for-dra.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-prepare-gke-nodes-for-dra.yaml new file mode 100644 index 00000000..fbcff002 --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-prepare-gke-nodes-for-dra.yaml @@ -0,0 +1,57 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: prepare-gpu-node-for-dra + namespace: nvidia + labels: + app: prepare-gpu-node-for-dra +spec: + selector: + matchLabels: + app: prepare-gpu-node-for-dra + template: + metadata: + labels: + app: prepare-gpu-node-for-dra + spec: + hostPID: true + hostIPC: true + nodeSelector: + nvidia.com/gpu: present + containers: + - image: ubuntu:22.04 + name: ctr + command: ["bash", "-c"] + args: + - |- 
+ chroot /host bash -c "until /opt/nvidia/bin/nvidia-smi; do :; done" + chroot /host bash -c " + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ + && \ + apt-get -o DPkg::Lock::Timeout=60 update \ + && \ + apt-get -o DPkg::Lock::Timeout=60 install nvidia-container-toolkit-base"; + chroot /host sed -i -e '/\[plugins."io.containerd.grpc.v1.cri"\]/a \ \ enable_cdi = true' /etc/containerd/config.toml; + chroot /host systemctl restart containerd; + sleep infinity; + securityContext: + privileged: true + volumeMounts: + - name: host-root + mountPath: /host + - name: host-sys + mountPath: /sys + volumes: + - name: host-root + hostPath: + path: / + - name: host-sys + hostPath: + path: /sys + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule \ No newline at end of file diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/values.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/values.yaml new file mode 100644 index 00000000..96921b21 --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/values.yaml @@ -0,0 +1,38 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# their old image +# image: +# repository: ghcr.io/nvidia/k8s-dra-driver +# tag: 9323da2d-ubuntu20.04 +# pullPolicy: Always + +# my image +image: + repository: us-central1-docker.pkg.dev/lauralorenz-gke-dev/dra/k8s-dra-driver + tag: v0.1.0 + pullPolicy: Always + +controller: + priorityClassName: "" + +kubeletPlugin: + priorityClassName: "" + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + +nvidiaDriverRoot: "/opt/nvidia" \ No newline at end of file diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/namespace-nvidia.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/namespace-nvidia.yaml new file mode 100644 index 00000000..9c8da2e7 --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/namespace-nvidia.yaml @@ -0,0 +1,18 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Namespace +metadata: + name: nvidia diff --git a/terraform/modules/cluster/gke.tf b/terraform/modules/cluster/gke.tf index e046e697..7ad6d3fb 100644 --- a/terraform/modules/cluster/gke.tf +++ b/terraform/modules/cluster/gke.tf @@ -32,6 +32,7 @@ resource "google_container_cluster" "mlp" { project = var.project_id remove_default_node_pool = var.remove_default_node_pool subnetwork = var.subnet + enable_kubernetes_alpha = true addons_config { gcp_filestore_csi_driver_config { @@ -54,12 +55,13 @@ resource "google_container_cluster" "mlp" { auto_provisioning_defaults { service_account = var.service_account oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] management { - auto_repair = true - auto_upgrade = true + auto_repair = false + auto_upgrade = false } shielded_instance_config { diff --git a/terraform/modules/node-pools/nodepools.tf b/terraform/modules/node-pools/nodepools.tf index 79b47ef0..fcf2e5c6 100644 --- a/terraform/modules/node-pools/nodepools.tf +++ b/terraform/modules/node-pools/nodepools.tf @@ -39,7 +39,8 @@ resource "google_container_node_pool" "node-pool" { machine_type = var.machine_type service_account = var.service_account oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] gcfs_config { From 3f78ce7e047356a17f3174fcf903751d7be06a9e Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Wed, 9 Oct 2024 23:50:38 +0000 Subject: [PATCH 2/2] expose master endpoint, DRA specific nodes, help the cpu node --- .../gke-aiml/playground/container_cluster.tf | 4 +- .../playground/container_node_pool.tf | 461 +++++++++++++----- terraform/modules/cluster/gke.tf | 2 +- 3 files changed, 329 insertions(+), 138 deletions(-) diff --git a/platforms/gke-aiml/playground/container_cluster.tf 
b/platforms/gke-aiml/playground/container_cluster.tf index 041d8cad..25f24704 100644 --- a/platforms/gke-aiml/playground/container_cluster.tf +++ b/platforms/gke-aiml/playground/container_cluster.tf @@ -75,7 +75,7 @@ resource "google_container_cluster" "mlp" { enabled = true auto_provisioning_defaults { - disk_type = "pd-standard" + disk_type = "pd-balanced" disk_size = 100 image_type = "UBUNTU_CONTAINERD" oauth_scopes = [ @@ -260,7 +260,7 @@ resource "google_container_cluster" "mlp" { private_cluster_config { enable_private_nodes = true - enable_private_endpoint = true + enable_private_endpoint = false master_ipv4_cidr_block = "172.16.0.32/28" } diff --git a/platforms/gke-aiml/playground/container_node_pool.tf b/platforms/gke-aiml/playground/container_node_pool.tf index 8362abbd..9155901a 100644 --- a/platforms/gke-aiml/playground/container_node_pool.tf +++ b/platforms/gke-aiml/playground/container_node_pool.tf @@ -95,116 +95,314 @@ resource "google_container_node_pool" "cpu_n4s8" { # GPU node shapes # Available zones: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-table ############################################################################## -# TODO: don't need for DRA demo -resource "google_container_node_pool" "gpu_a100x2_a2h2" { - depends_on = [google_gke_hub_membership.cluster] - - # Variables - cluster = google_container_cluster.mlp.name - location = var.subnet_01_region - name = "gpu-a100x2-a2h2" - node_locations = [ - "${var.subnet_01_region}-a", - "${var.subnet_01_region}-b", - "${var.subnet_01_region}-c", - "${var.subnet_01_region}-f" - ] - project = data.google_project.environment.project_id - # Blocks - autoscaling { - location_policy = "ANY" - total_max_node_count = 1000 - total_min_node_count = 0 - } +# resource "google_container_node_pool" "gpu_a100x2_a2h2" { +# depends_on = [google_gke_hub_membership.cluster] + +# # Variables +# cluster = google_container_cluster.mlp.name +# location = var.subnet_01_region +# name = 
"gpu-a100x2-a2h2" +# node_locations = [ +# "${var.subnet_01_region}-a", +# "${var.subnet_01_region}-b", +# "${var.subnet_01_region}-c", +# "${var.subnet_01_region}-f" +# ] +# project = data.google_project.environment.project_id + +# # Blocks +# autoscaling { +# location_policy = "ANY" +# total_max_node_count = 1000 +# total_min_node_count = 0 +# } + +# management { +# auto_repair = false +# auto_upgrade = false +# } + +# lifecycle { +# ignore_changes = [ +# node_config[0].labels, +# node_config[0].taint, +# ] +# } + +# network_config { +# enable_private_nodes = true +# } + +# node_config { +# # Variables +# labels = { +# "resource-model" : "a100" +# "resource-type" : "gpu" +# } +# machine_type = "a2-highgpu-2g" +# service_account = google_service_account.cluster.email +# oauth_scopes = [ +# "https://www.googleapis.com/auth/cloud-platform", +# "https://www.googleapis.com/auth/devstorage.read_only" +# ] + +# # Blocks +# gcfs_config { +# enabled = true +# } + +# guest_accelerator { +# count = 2 +# type = "nvidia-tesla-a100" + +# gpu_driver_installation_config { +# gpu_driver_version = var.gpu_driver_version +# } +# } + +# gvnic { +# enabled = true +# } + +# reservation_affinity { +# consume_reservation_type = "NO_RESERVATION" +# } + +# shielded_instance_config { +# enable_integrity_monitoring = true +# enable_secure_boot = true +# } + +# taint { +# effect = "NO_SCHEDULE" +# key = "on-demand" +# value = true +# } +# } + +# timeouts { +# create = "30m" +# update = "20m" +# } +# } - management { - auto_repair = false - auto_upgrade = false - } - - lifecycle { - ignore_changes = [ - node_config[0].labels, - node_config[0].taint, - ] - } - - network_config { - enable_private_nodes = true - } - - node_config { - # Variables - labels = { - "resource-model" : "a100" - "resource-type" : "gpu" - } - machine_type = "a2-highgpu-2g" - service_account = google_service_account.cluster.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform", - 
"https://www.googleapis.com/auth/devstorage.read_only" - ] - - # Blocks - gcfs_config { - enabled = true - } - - guest_accelerator { - count = 2 - type = "nvidia-tesla-a100" - - gpu_driver_installation_config { - gpu_driver_version = var.gpu_driver_version - } - } +############################################################################## - gvnic { - enabled = true - } +# resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { +# depends_on = [google_gke_hub_membership.cluster] + +# # Variables +# cluster = google_container_cluster.mlp.name +# location = var.subnet_01_region +# name = "gpu-a100x2-a2h2-dws" +# node_locations = [ +# "${var.subnet_01_region}-a", +# "${var.subnet_01_region}-b", +# "${var.subnet_01_region}-c", +# "${var.subnet_01_region}-f" +# ] +# project = data.google_project.environment.project_id + +# # Blocks +# autoscaling { +# location_policy = "ANY" +# total_max_node_count = 1000 +# total_min_node_count = 0 +# } + +# management { +# auto_repair = false +# auto_upgrade = false +# } + +# lifecycle { +# ignore_changes = [ +# node_config[0].labels, +# node_config[0].taint, +# ] +# } + +# network_config { +# enable_private_nodes = true +# } + +# node_config { +# # Variables +# labels = { +# "resource-model" : "a100" +# "resource-type" : "gpu" +# } +# machine_type = "a2-highgpu-2g" +# service_account = google_service_account.cluster.email +# oauth_scopes = [ +# "https://www.googleapis.com/auth/cloud-platform", +# "https://www.googleapis.com/auth/devstorage.read_only" +# ] + +# # Blocks +# gcfs_config { +# enabled = true +# } + +# guest_accelerator { +# count = 2 +# type = "nvidia-tesla-a100" + +# gpu_driver_installation_config { +# gpu_driver_version = var.gpu_driver_version +# } +# } + +# gvnic { +# enabled = true +# } + +# reservation_affinity { +# consume_reservation_type = "NO_RESERVATION" +# } + +# shielded_instance_config { +# enable_integrity_monitoring = true +# enable_secure_boot = true +# } + +# taint { +# effect = "NO_SCHEDULE" +# 
key = "on-demand" +# value = true +# } +# } + +# queued_provisioning { +# enabled = true +# } + +# timeouts { +# create = "30m" +# update = "20m" +# } +# } - reservation_affinity { - consume_reservation_type = "NO_RESERVATION" - } +############################################################################### - shielded_instance_config { - enable_integrity_monitoring = true - enable_secure_boot = true - } +# resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { +# depends_on = [google_gke_hub_membership.cluster] + +# # Variables +# cluster = google_container_cluster.mlp.name +# location = var.subnet_01_region +# name = "gpu-h100x8-a3h8-dws" +# node_locations = [ +# "${var.subnet_01_region}-a", +# "${var.subnet_01_region}-c" +# ] +# project = data.google_project.environment.project_id + +# # Blocks +# autoscaling { +# location_policy = "ANY" +# total_max_node_count = 1000 +# total_min_node_count = 0 +# } + +# management { +# auto_repair = false +# auto_upgrade = false +# } + +# lifecycle { +# ignore_changes = [ +# node_config[0].labels, +# node_config[0].taint, +# ] +# } + +# network_config { +# enable_private_nodes = true +# } + +# node_config { +# # Variables +# labels = { +# "resource-model" : "h100" +# "resource-type" : "gpu", +# "gke-no-default-nvidia-gpu-device-plugin" : "true", +# "nvidia.com/gpu" : "present", +# "nvidia.com/dra.kubelet-plugin" : "true" +# "nvidia.com/dra.controller" : "true" +# } +# machine_type = "a3-highgpu-8g" +# service_account = google_service_account.cluster.email +# oauth_scopes = [ +# "https://www.googleapis.com/auth/cloud-platform", +# "https://www.googleapis.com/auth/devstorage.read_only" +# ] + +# # Blocks +# ephemeral_storage_local_ssd_config { +# local_ssd_count = 16 +# } + +# gcfs_config { +# enabled = true +# } + +# guest_accelerator { +# count = 8 +# type = "nvidia-h100-80gb" + +# gpu_driver_installation_config { +# gpu_driver_version = var.gpu_driver_version +# } +# } + +# gvnic { +# enabled = true +# } + +# 
reservation_affinity { +# consume_reservation_type = "NO_RESERVATION" +# } + +# shielded_instance_config { +# enable_integrity_monitoring = true +# enable_secure_boot = true +# } + +# taint { +# effect = "NO_SCHEDULE" +# key = "on-demand" +# value = true +# } +# } + +# queued_provisioning { +# enabled = true +# } + +# timeouts { +# create = "30m" +# update = "20m" +# } +# } - taint { - effect = "NO_SCHEDULE" - key = "on-demand" - value = true - } - } - - timeouts { - create = "30m" - update = "20m" - } -} +############################################################################### -############################################################################## -# TODO: don't need for DRA demo -resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { +resource "google_container_node_pool" "gpu_l4x2_g2s24" { depends_on = [google_gke_hub_membership.cluster] # Variables cluster = google_container_cluster.mlp.name location = var.subnet_01_region - name = "gpu-a100x2-a2h2-dws" + name = "gpu-l4x2-g2s24" node_locations = [ "${var.subnet_01_region}-a", "${var.subnet_01_region}-b", - "${var.subnet_01_region}-c", - "${var.subnet_01_region}-f" + "${var.subnet_01_region}-c" ] project = data.google_project.environment.project_id - # Blocks autoscaling { location_policy = "ANY" total_max_node_count = 1000 @@ -230,10 +428,14 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { node_config { # Variables labels = { - "resource-model" : "a100" + "resource-model" : "l4" "resource-type" : "gpu" + "gke-no-default-nvidia-gpu-device-plugin" : "true", + "nvidia.com/gpu" : "present", + "nvidia.com/dra.kubelet-plugin" : "true" + "nvidia.com/dra.controller" : "true" } - machine_type = "a2-highgpu-2g" + machine_type = "g2-standard-24" service_account = google_service_account.cluster.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform", @@ -247,7 +449,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { guest_accelerator { count = 2 - type = 
"nvidia-tesla-a100" + type = "nvidia-l4" gpu_driver_installation_config { gpu_driver_version = var.gpu_driver_version @@ -274,10 +476,6 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { } } - queued_provisioning { - enabled = true - } - timeouts { create = "30m" update = "20m" @@ -286,20 +484,20 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { ############################################################################### -resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { +resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { depends_on = [google_gke_hub_membership.cluster] # Variables cluster = google_container_cluster.mlp.name location = var.subnet_01_region - name = "gpu-h100x8-a3h8-dws" + name = "gpu-l4x2-g2s24-dws" node_locations = [ "${var.subnet_01_region}-a", + "${var.subnet_01_region}-b", "${var.subnet_01_region}-c" ] project = data.google_project.environment.project_id - # Blocks autoscaling { location_policy = "ANY" total_max_node_count = 1000 @@ -325,14 +523,14 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { node_config { # Variables labels = { - "resource-model" : "h100" - "resource-type" : "gpu", + "resource-model" : "l4" + "resource-type" : "gpu" "gke-no-default-nvidia-gpu-device-plugin" : "true", "nvidia.com/gpu" : "present", "nvidia.com/dra.kubelet-plugin" : "true" "nvidia.com/dra.controller" : "true" } - machine_type = "a3-highgpu-8g" + machine_type = "g2-standard-24" service_account = google_service_account.cluster.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform", @@ -340,17 +538,13 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { ] # Blocks - ephemeral_storage_local_ssd_config { - local_ssd_count = 16 - } - gcfs_config { enabled = true } guest_accelerator { - count = 8 - type = "nvidia-h100-80gb" + count = 2 + type = "nvidia-l4" gpu_driver_installation_config { gpu_driver_version = var.gpu_driver_version @@ -389,13 +583,13 @@ resource 
"google_container_node_pool" "gpu_h100x8_a3h8_dws" { ############################################################################### -resource "google_container_node_pool" "gpu_l4x2_g2s24" { +resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { depends_on = [google_gke_hub_membership.cluster] # Variables cluster = google_container_cluster.mlp.name location = var.subnet_01_region - name = "gpu-l4x2-g2s24" + name = "gpu-l4x2-g2s24-spot" node_locations = [ "${var.subnet_01_region}-a", "${var.subnet_01_region}-b", @@ -403,6 +597,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { ] project = data.google_project.environment.project_id + # Blocks autoscaling { location_policy = "ANY" total_max_node_count = 1000 @@ -441,6 +636,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { "https://www.googleapis.com/auth/cloud-platform", "https://www.googleapis.com/auth/devstorage.read_only" ] + spot = true # Blocks gcfs_config { @@ -471,7 +667,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { taint { effect = "NO_SCHEDULE" - key = "on-demand" + key = "spot" value = true } } @@ -482,15 +678,17 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { } } +### Attached GPUs ############################################################################### -resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { +# n1s8 shape ($0.44/hr) with 2 nvidia-tesla-t4 attached ($0.35 per GPU/hr) autoscaling up to 2 nodes = $2.28/hr +resource "google_container_node_pool" "gpu_t4x2_n1s8" { depends_on = [google_gke_hub_membership.cluster] # Variables cluster = google_container_cluster.mlp.name location = var.subnet_01_region - name = "gpu-l4x2-g2s24-dws" + name = "gpu_t4x2_n1s8" node_locations = [ "${var.subnet_01_region}-a", "${var.subnet_01_region}-b", @@ -500,7 +698,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { autoscaling { location_policy = "ANY" - total_max_node_count = 1000 + total_max_node_count = 2 total_min_node_count = 0 
} @@ -523,14 +721,14 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { node_config { # Variables labels = { - "resource-model" : "l4" + "resource-model" : "t4" "resource-type" : "gpu" "gke-no-default-nvidia-gpu-device-plugin" : "true", "nvidia.com/gpu" : "present", "nvidia.com/dra.kubelet-plugin" : "true" "nvidia.com/dra.controller" : "true" } - machine_type = "g2-standard-24" + machine_type = "n1-standard-8" service_account = google_service_account.cluster.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform", @@ -544,7 +742,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { guest_accelerator { count = 2 - type = "nvidia-l4" + type = "nvidia-tesla-t4" gpu_driver_installation_config { gpu_driver_version = var.gpu_driver_version @@ -571,10 +769,6 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { } } - queued_provisioning { - enabled = true - } - timeouts { create = "30m" update = "20m" @@ -583,13 +777,14 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { ############################################################################### -resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { +# n1s8 shape ($0.44/hr) with 2 nvidia-tesla-v100 attached ($2.48 per GPU/hr) autoscaling up to 2 nodes = $10.80/hr +resource "google_container_node_pool" "gpu_v100x2_n1s8" { depends_on = [google_gke_hub_membership.cluster] # Variables cluster = google_container_cluster.mlp.name location = var.subnet_01_region - name = "gpu-l4x2-g2s24-spot" + name = "gpu-v100x2-n1s8" node_locations = [ "${var.subnet_01_region}-a", "${var.subnet_01_region}-b", @@ -597,10 +792,9 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { ] project = data.google_project.environment.project_id - # Blocks autoscaling { location_policy = "ANY" - total_max_node_count = 1000 + total_max_node_count = 2 total_min_node_count = 0 } @@ -623,20 +817,19 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { node_config { # 
Variables labels = { - "resource-model" : "l4" + "resource-model" : "v100" "resource-type" : "gpu" "gke-no-default-nvidia-gpu-device-plugin" : "true", "nvidia.com/gpu" : "present", "nvidia.com/dra.kubelet-plugin" : "true" "nvidia.com/dra.controller" : "true" } - machine_type = "g2-standard-24" + machine_type = "n1-standard-8" service_account = google_service_account.cluster.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform", "https://www.googleapis.com/auth/devstorage.read_only" ] - spot = true # Blocks gcfs_config { @@ -645,7 +838,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { guest_accelerator { count = 2 - type = "nvidia-l4" + type = "nvidia-tesla-v100" gpu_driver_installation_config { gpu_driver_version = var.gpu_driver_version @@ -667,7 +860,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { taint { effect = "NO_SCHEDULE" - key = "spot" + key = "on-demand" value = true } } @@ -678,8 +871,6 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { } } - - # TPU # Available zones: https://cloud.google.com/tpu/docs/regions-zones ############################################################################### diff --git a/terraform/modules/cluster/gke.tf b/terraform/modules/cluster/gke.tf index 7ad6d3fb..7e2d198b 100644 --- a/terraform/modules/cluster/gke.tf +++ b/terraform/modules/cluster/gke.tf @@ -202,7 +202,7 @@ resource "google_container_cluster" "mlp" { private_cluster_config { enable_private_nodes = true - enable_private_endpoint = true + enable_private_endpoint = false master_ipv4_cidr_block = "172.16.0.32/28" }