From 0b302b319fac66b8a782ae0ea64ee4ac21ba96ed Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Wed, 9 Oct 2024 21:56:51 +0000 Subject: [PATCH 1/2] Add changes for alpha, node types, nvidia driver --- platforms/gke-aiml/playground/configsync.tf | 25 ++++ .../gke-aiml/playground/container_cluster.tf | 21 ++- .../playground/container_node_pool.tf | 88 ++++++++++-- .../scripts/nvidia_dra_driver_manifests.sh | 58 ++++++++ .../_cluster_template/dra/kustomization.yaml | 17 +++ .../dra/nvidia-dra-drivers/kustomization.yaml | 25 ++++ ...-driver-installer-daemonset-preloaded.yaml | 126 ++++++++++++++++++ ...r-installer-prepare-gke-nodes-for-dra.yaml | 57 ++++++++ .../dra/nvidia-dra-drivers/values.yaml | 38 ++++++ .../_cluster_template/namespace-nvidia.yaml | 18 +++ terraform/modules/cluster/gke.tf | 8 +- terraform/modules/node-pools/nodepools.tf | 3 +- 12 files changed, 461 insertions(+), 23 deletions(-) create mode 100755 platforms/gke-aiml/playground/scripts/nvidia_dra_driver_manifests.sh create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/kustomization.yaml create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/kustomization.yaml create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-daemonset-preloaded.yaml create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-prepare-gke-nodes-for-dra.yaml create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/values.yaml create mode 100644 platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/namespace-nvidia.yaml diff --git a/platforms/gke-aiml/playground/configsync.tf b/platforms/gke-aiml/playground/configsync.tf index 41a7457b..3f6f8923 100644 --- 
a/platforms/gke-aiml/playground/configsync.tf +++ b/platforms/gke-aiml/playground/configsync.tf @@ -161,6 +161,31 @@ resource "null_resource" "kueue" { # } # } +# NVIDIA DRA DRIVER +############################################################################### +resource "null_resource" "nvidia_dra_driver" { + depends_on = [ + google_gke_hub_feature_membership.cluster_configmanagement, + google_secret_manager_secret_version.git_config, + module.configsync_repository, + ] + + provisioner "local-exec" { + command = "${path.module}/scripts/nvidia_dra_driver_manifests.sh" + environment = { + GIT_CONFIG_SECRET_NAME = local.git_config_secret_name + GIT_REPOSITORY = local.git_repository + MANIFESTS_DIRECTORY = local.configsync_manifests_directory + PROJECT_ID = data.google_project.environment.project_id + MLP_AR_REPO_URL = "${google_artifact_registry_repository.container_images.location}-docker.pkg.dev/${google_artifact_registry_repository.container_images.project}/${var.environment_name}/k8s-dra-driver:v0.1.0" + } + } + + triggers = { + md5_files = md5(join("", [for f in fileset("${path.module}/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers", "**") : md5("${path.module}/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/${f}")])) + md5_script = filemd5("${path.module}/scripts/nvidia_dra_driver_manifests.sh") + } +} # KUBERAY MANIFESTS diff --git a/platforms/gke-aiml/playground/container_cluster.tf b/platforms/gke-aiml/playground/container_cluster.tf index a9bbb5ee..041d8cad 100644 --- a/platforms/gke-aiml/playground/container_cluster.tf +++ b/platforms/gke-aiml/playground/container_cluster.tf @@ -54,6 +54,7 @@ resource "google_container_cluster" "mlp" { project = data.google_project.environment.project_id remove_default_node_pool = false subnetwork = module.create-vpc.subnet-1 + enable_kubernetes_alpha = true addons_config { gcp_filestore_csi_driver_config { @@ -74,15 +75,18 @@ resource "google_container_cluster" "mlp" { 
enabled = true auto_provisioning_defaults { - disk_type = "pd-balanced" + disk_type = "pd-standard" + disk_size = 100 + image_type = "UBUNTU_CONTAINERD" oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] service_account = google_service_account.cluster.email management { - auto_repair = true - auto_upgrade = true + auto_repair = false + auto_upgrade = false } shielded_instance_config { @@ -221,11 +225,17 @@ resource "google_container_cluster" "mlp" { enable_private_nodes = true } + management { + auto_repair = false + auto_upgrade = false + } + node_config { machine_type = "e2-standard-4" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] gcfs_config { @@ -245,6 +255,7 @@ resource "google_container_cluster" "mlp" { enabled = true } } + } private_cluster_config { diff --git a/platforms/gke-aiml/playground/container_node_pool.tf b/platforms/gke-aiml/playground/container_node_pool.tf index 28a637b0..8362abbd 100644 --- a/platforms/gke-aiml/playground/container_node_pool.tf +++ b/platforms/gke-aiml/playground/container_node_pool.tf @@ -36,6 +36,11 @@ resource "google_container_node_pool" "cpu_n4s8" { total_min_node_count = 1 } + management { + auto_repair = false + auto_upgrade = false + } + network_config { enable_private_nodes = true } @@ -49,7 +54,8 @@ resource "google_container_node_pool" "cpu_n4s8" { machine_type = "n4-standard-8" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -78,16 +84,18 @@ resource "google_container_node_pool" "cpu_n4s8" { } timeouts { - 
create = "30m" - update = "20m" + create = "1m" + update = "1m" + delete = "1m" } } -# GPU +# GPU node shapes # Available zones: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-table -############################################################################### +############################################################################## +# TODO: don't need for DRA demo resource "google_container_node_pool" "gpu_a100x2_a2h2" { depends_on = [google_gke_hub_membership.cluster] @@ -110,6 +118,11 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -130,7 +143,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2" { machine_type = "a2-highgpu-2g" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -173,8 +187,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2" { } } -############################################################################### - +############################################################################## +# TODO: don't need for DRA demo resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { depends_on = [google_gke_hub_membership.cluster] @@ -197,6 +211,11 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -217,7 +236,8 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { machine_type = "a2-highgpu-2g" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + 
"https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -286,6 +306,11 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -301,12 +326,17 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { # Variables labels = { "resource-model" : "h100" - "resource-type" : "gpu" + "resource-type" : "gpu", + "gke-no-default-nvidia-gpu-device-plugin" : "true", + "nvidia.com/gpu" : "present", + "nvidia.com/dra.kubelet-plugin" : "true" + "nvidia.com/dra.controller" : "true" } machine_type = "a3-highgpu-8g" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -379,6 +409,11 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -395,11 +430,16 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { labels = { "resource-model" : "l4" "resource-type" : "gpu" + "gke-no-default-nvidia-gpu-device-plugin" : "true", + "nvidia.com/gpu" : "present", + "nvidia.com/dra.kubelet-plugin" : "true" + "nvidia.com/dra.controller" : "true" } machine_type = "g2-standard-24" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -464,6 +504,11 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -480,11 +525,16 @@ resource 
"google_container_node_pool" "gpu_l4x2_g2s24_dws" { labels = { "resource-model" : "l4" "resource-type" : "gpu" + "gke-no-default-nvidia-gpu-device-plugin" : "true", + "nvidia.com/gpu" : "present", + "nvidia.com/dra.kubelet-plugin" : "true" + "nvidia.com/dra.controller" : "true" } machine_type = "g2-standard-24" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] # Blocks @@ -554,6 +604,11 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { total_min_node_count = 0 } + management { + auto_repair = false + auto_upgrade = false + } + lifecycle { ignore_changes = [ node_config[0].labels, @@ -570,11 +625,16 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { labels = { "resource-model" : "l4" "resource-type" : "gpu" + "gke-no-default-nvidia-gpu-device-plugin" : "true", + "nvidia.com/gpu" : "present", + "nvidia.com/dra.kubelet-plugin" : "true" + "nvidia.com/dra.controller" : "true" } machine_type = "g2-standard-24" service_account = google_service_account.cluster.email oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] spot = true diff --git a/platforms/gke-aiml/playground/scripts/nvidia_dra_driver_manifests.sh b/platforms/gke-aiml/playground/scripts/nvidia_dra_driver_manifests.sh new file mode 100755 index 00000000..2b23ddce --- /dev/null +++ b/platforms/gke-aiml/playground/scripts/nvidia_dra_driver_manifests.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -u + +SCRIPT_PATH="$( + cd "$(dirname "$0")" >/dev/null 2>&1 + pwd -P +)" + +source ${SCRIPT_PATH}/helpers/clone_git_repo.sh + +# Set directory and path variables +clusters_directory="manifests/clusters" +clusters_path="${GIT_REPOSITORY_PATH}/${clusters_directory}" +cluster_template_directory="templates/_cluster_template" +cluster_template_path="${GIT_REPOSITORY_PATH}/${cluster_template_directory}" +repo_container_images_url="${MLP_AR_REPO_URL}" + +cd "${clusters_path}" || { + echo "Clusters directory '${clusters_directory}' does not exist" + exit 100 +} + +echo "'${repo_container_images_url}'" + +cp -pr ${cluster_template_path}/dra/nvidia-dra-drivers ${clusters_path}/ +cp -pr ${cluster_template_path}/namespace-nvidia.yaml ${clusters_path}/ + +# Added entries to the kustomization file +resources=$(find ${clusters_path} -maxdepth 1 -mindepth 1 -type d | sort) +resources+=" " +export resources+=$(find ${clusters_path} -maxdepth 1 -type f -name "*.yaml" ! -name "kustomization.yaml" ! -name "*values.yaml" | sort) +export kustomization_file="${clusters_path}/kustomization.yaml" +source ${SCRIPT_PATH}/helpers/add_to_kustomization.sh + +#TODO: build the image here as well +# something like `./${k8s-nvidia-driver-path}/demo/clusters/kind/scripts/build-driver-image.sh` +docker tag nvcr.io/nvidia/cloud-native/k8s-dra-driver:v0.1.0 ${repo_container_images_url} +docker push ${repo_container_images_url} + +# Add, commit, and push changes to the repository +cd ${GIT_REPOSITORY_PATH} +git add . 
+git commit -m "Manifests for NVIDIA DRA" +git push origin diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/kustomization.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/kustomization.yaml new file mode 100644 index 00000000..fe298fe7 --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/kustomization.yaml @@ -0,0 +1,17 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/kustomization.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/kustomization.yaml new file mode 100644 index 00000000..f3bb902d --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/kustomization.yaml @@ -0,0 +1,25 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Another option is to use the NVIDIA Helm chart +# helmCharts: +# - name: k8s-dra-driver +# repo: https://github.com/NVIDIA/k8s-dra-driver/tree/main/deployments/helm +# version: 0.1.0 +# releaseName: k8s-dra-driver +# includeCRDs: true +# valuesFile: ??.yaml \ No newline at end of file diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-daemonset-preloaded.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-daemonset-preloaded.yaml new file mode 100644 index 00000000..7cb4c8d5 --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-daemonset-preloaded.yaml @@ -0,0 +1,126 @@ +# Copyright 2017 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This daemonset installs nvidia driver 450.80.02 and invokes the +# partition_gpu tool to enable MIG mode and create GPU instances as specified +# in the GPU config. + +##### SOURCE: https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/master/nvidia-driver-installer/cos/daemonset-nvidia-mig.yaml +### NEED UBUNTU? check https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/master/nvidia-driver-installer/ubuntu/daemonset.yaml + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-driver-installer + namespace: kube-system + labels: + k8s-app: nvidia-driver-installer +spec: + selector: + matchLabels: + k8s-app: nvidia-driver-installer + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-driver-installer + k8s-app: nvidia-driver-installer + spec: + priorityClassName: system-node-critical + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists + tolerations: + - operator: "Exists" + hostNetwork: true + hostPID: true + volumes: + - name: dev + hostPath: + path: /dev + - name: vulkan-icd-mount + hostPath: + path: /home/kubernetes/bin/nvidia/vulkan/icd.d + - name: nvidia-install-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: root-mount + hostPath: + path: / + - name: cos-tools + hostPath: + path: /var/lib/cos-tools + - name: nvidia-config + hostPath: + path: /etc/nvidia + initContainers: + - image: "cos-nvidia-installer:fixed" + imagePullPolicy: Never + name: nvidia-driver-installer + resources: + requests: + cpu: 150m + securityContext: + privileged: true + env: + - name: NVIDIA_INSTALL_DIR_HOST + value: /home/kubernetes/bin/nvidia + - name: NVIDIA_INSTALL_DIR_CONTAINER + value: /usr/local/nvidia + - name: VULKAN_ICD_DIR_HOST + value: /home/kubernetes/bin/nvidia/vulkan/icd.d + - name: VULKAN_ICD_DIR_CONTAINER + value: 
/etc/vulkan/icd.d + - name: ROOT_MOUNT_DIR + value: /root + - name: COS_TOOLS_DIR_HOST + value: /var/lib/cos-tools + - name: COS_TOOLS_DIR_CONTAINER + value: /build/cos-tools + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia + - name: vulkan-icd-mount + mountPath: /etc/vulkan/icd.d + - name: dev + mountPath: /dev + - name: root-mount + mountPath: /root + - name: cos-tools + mountPath: /build/cos-tools + - image: "gcr.io/gke-release/nvidia-partition-gpu@sha256:e226275da6c45816959fe43cde907ee9a85c6a2aa8a429418a4cadef8ecdb86a" + name: partition-gpus + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + resources: + requests: + cpu: 150m + securityContext: + privileged: true + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia + - name: dev + mountPath: /dev + - name: nvidia-config + mountPath: /etc/nvidia + containers: + - image: "registry.k8s.io/pause:3.9" + name: pause \ No newline at end of file diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-prepare-gke-nodes-for-dra.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-prepare-gke-nodes-for-dra.yaml new file mode 100644 index 00000000..fbcff002 --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/nvidia-driver-installer-prepare-gke-nodes-for-dra.yaml @@ -0,0 +1,57 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: prepare-gpu-node-for-dra + namespace: nvidia + labels: + app: prepare-gpu-node-for-dra +spec: + selector: + matchLabels: + app: prepare-gpu-node-for-dra + template: + metadata: + labels: + app: prepare-gpu-node-for-dra + spec: + hostPID: true + hostIPC: true + nodeSelector: + nvidia.com/gpu: present + containers: + - image: ubuntu:22.04 + name: ctr + command: ["bash", "-c"] + args: + - |- 
+ chroot /host bash -c "until /opt/nvidia/bin/nvidia-smi; do :; done" + chroot /host bash -c " + curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | gpg --yes --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ + sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ + tee /etc/apt/sources.list.d/nvidia-container-toolkit.list \ + && \ + apt-get -o DPkg::Lock::Timeout=60 update \ + && \ + apt-get -o DPkg::Lock::Timeout=60 install nvidia-container-toolkit-base"; + chroot /host sed -i -e '/\[plugins."io.containerd.grpc.v1.cri"\]/a \ \ enable_cdi = true' /etc/containerd/config.toml; + chroot /host systemctl restart containerd; + sleep infinity; + securityContext: + privileged: true + volumeMounts: + - name: host-root + mountPath: /host + - name: host-sys + mountPath: /sys + volumes: + - name: host-root + hostPath: + path: / + - name: host-sys + hostPath: + path: /sys + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule \ No newline at end of file diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/values.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/values.yaml new file mode 100644 index 00000000..96921b21 --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/dra/nvidia-dra-drivers/values.yaml @@ -0,0 +1,38 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# their old image +# image: +# repository: ghcr.io/nvidia/k8s-dra-driver +# tag: 9323da2d-ubuntu20.04 +# pullPolicy: Always + +# my image +image: + repository: us-central1-docker.pkg.dev/lauralorenz-gke-dev/dra/k8s-dra-driver + tag: v0.1.0 + pullPolicy: Always + +controller: + priorityClassName: "" + +kubeletPlugin: + priorityClassName: "" + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + +nvidiaDriverRoot: "/opt/nvidia" \ No newline at end of file diff --git a/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/namespace-nvidia.yaml b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/namespace-nvidia.yaml new file mode 100644 index 00000000..9c8da2e7 --- /dev/null +++ b/platforms/gke-aiml/playground/templates/configsync/templates/_cluster_template/namespace-nvidia.yaml @@ -0,0 +1,18 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Namespace +metadata: + name: nvidia diff --git a/terraform/modules/cluster/gke.tf b/terraform/modules/cluster/gke.tf index e046e697..7ad6d3fb 100644 --- a/terraform/modules/cluster/gke.tf +++ b/terraform/modules/cluster/gke.tf @@ -32,6 +32,7 @@ resource "google_container_cluster" "mlp" { project = var.project_id remove_default_node_pool = var.remove_default_node_pool subnetwork = var.subnet + enable_kubernetes_alpha = true addons_config { gcp_filestore_csi_driver_config { @@ -54,12 +55,13 @@ resource "google_container_cluster" "mlp" { auto_provisioning_defaults { service_account = var.service_account oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] management { - auto_repair = true - auto_upgrade = true + auto_repair = false + auto_upgrade = false } shielded_instance_config { diff --git a/terraform/modules/node-pools/nodepools.tf b/terraform/modules/node-pools/nodepools.tf index 79b47ef0..fcf2e5c6 100644 --- a/terraform/modules/node-pools/nodepools.tf +++ b/terraform/modules/node-pools/nodepools.tf @@ -39,7 +39,8 @@ resource "google_container_node_pool" "node-pool" { machine_type = var.machine_type service_account = var.service_account oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform" + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/devstorage.read_only" ] gcfs_config { From 3f78ce7e047356a17f3174fcf903751d7be06a9e Mon Sep 17 00:00:00 2001 From: Laura Lorenz Date: Wed, 9 Oct 2024 23:50:38 +0000 Subject: [PATCH 2/2] expose master endpoint, DRA specific nodes, help the cpu node --- .../gke-aiml/playground/container_cluster.tf | 4 +- .../playground/container_node_pool.tf | 461 +++++++++++++----- terraform/modules/cluster/gke.tf | 2 +- 3 files changed, 329 insertions(+), 138 deletions(-) diff --git a/platforms/gke-aiml/playground/container_cluster.tf 
b/platforms/gke-aiml/playground/container_cluster.tf index 041d8cad..25f24704 100644 --- a/platforms/gke-aiml/playground/container_cluster.tf +++ b/platforms/gke-aiml/playground/container_cluster.tf @@ -75,7 +75,7 @@ resource "google_container_cluster" "mlp" { enabled = true auto_provisioning_defaults { - disk_type = "pd-standard" + disk_type = "pd-balanced" disk_size = 100 image_type = "UBUNTU_CONTAINERD" oauth_scopes = [ @@ -260,7 +260,7 @@ resource "google_container_cluster" "mlp" { private_cluster_config { enable_private_nodes = true - enable_private_endpoint = true + enable_private_endpoint = false master_ipv4_cidr_block = "172.16.0.32/28" } diff --git a/platforms/gke-aiml/playground/container_node_pool.tf b/platforms/gke-aiml/playground/container_node_pool.tf index 8362abbd..9155901a 100644 --- a/platforms/gke-aiml/playground/container_node_pool.tf +++ b/platforms/gke-aiml/playground/container_node_pool.tf @@ -95,116 +95,314 @@ resource "google_container_node_pool" "cpu_n4s8" { # GPU node shapes # Available zones: https://cloud.google.com/compute/docs/gpus/gpu-regions-zones#view-using-table ############################################################################## -# TODO: don't need for DRA demo -resource "google_container_node_pool" "gpu_a100x2_a2h2" { - depends_on = [google_gke_hub_membership.cluster] - - # Variables - cluster = google_container_cluster.mlp.name - location = var.subnet_01_region - name = "gpu-a100x2-a2h2" - node_locations = [ - "${var.subnet_01_region}-a", - "${var.subnet_01_region}-b", - "${var.subnet_01_region}-c", - "${var.subnet_01_region}-f" - ] - project = data.google_project.environment.project_id - # Blocks - autoscaling { - location_policy = "ANY" - total_max_node_count = 1000 - total_min_node_count = 0 - } +# resource "google_container_node_pool" "gpu_a100x2_a2h2" { +# depends_on = [google_gke_hub_membership.cluster] + +# # Variables +# cluster = google_container_cluster.mlp.name +# location = var.subnet_01_region +# name = 
"gpu-a100x2-a2h2" +# node_locations = [ +# "${var.subnet_01_region}-a", +# "${var.subnet_01_region}-b", +# "${var.subnet_01_region}-c", +# "${var.subnet_01_region}-f" +# ] +# project = data.google_project.environment.project_id + +# # Blocks +# autoscaling { +# location_policy = "ANY" +# total_max_node_count = 1000 +# total_min_node_count = 0 +# } + +# management { +# auto_repair = false +# auto_upgrade = false +# } + +# lifecycle { +# ignore_changes = [ +# node_config[0].labels, +# node_config[0].taint, +# ] +# } + +# network_config { +# enable_private_nodes = true +# } + +# node_config { +# # Variables +# labels = { +# "resource-model" : "a100" +# "resource-type" : "gpu" +# } +# machine_type = "a2-highgpu-2g" +# service_account = google_service_account.cluster.email +# oauth_scopes = [ +# "https://www.googleapis.com/auth/cloud-platform", +# "https://www.googleapis.com/auth/devstorage.read_only" +# ] + +# # Blocks +# gcfs_config { +# enabled = true +# } + +# guest_accelerator { +# count = 2 +# type = "nvidia-tesla-a100" + +# gpu_driver_installation_config { +# gpu_driver_version = var.gpu_driver_version +# } +# } + +# gvnic { +# enabled = true +# } + +# reservation_affinity { +# consume_reservation_type = "NO_RESERVATION" +# } + +# shielded_instance_config { +# enable_integrity_monitoring = true +# enable_secure_boot = true +# } + +# taint { +# effect = "NO_SCHEDULE" +# key = "on-demand" +# value = true +# } +# } + +# timeouts { +# create = "30m" +# update = "20m" +# } +# } - management { - auto_repair = false - auto_upgrade = false - } - - lifecycle { - ignore_changes = [ - node_config[0].labels, - node_config[0].taint, - ] - } - - network_config { - enable_private_nodes = true - } - - node_config { - # Variables - labels = { - "resource-model" : "a100" - "resource-type" : "gpu" - } - machine_type = "a2-highgpu-2g" - service_account = google_service_account.cluster.email - oauth_scopes = [ - "https://www.googleapis.com/auth/cloud-platform", - 
"https://www.googleapis.com/auth/devstorage.read_only" - ] - - # Blocks - gcfs_config { - enabled = true - } - - guest_accelerator { - count = 2 - type = "nvidia-tesla-a100" - - gpu_driver_installation_config { - gpu_driver_version = var.gpu_driver_version - } - } +############################################################################## - gvnic { - enabled = true - } +# resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { +# depends_on = [google_gke_hub_membership.cluster] + +# # Variables +# cluster = google_container_cluster.mlp.name +# location = var.subnet_01_region +# name = "gpu-a100x2-a2h2-dws" +# node_locations = [ +# "${var.subnet_01_region}-a", +# "${var.subnet_01_region}-b", +# "${var.subnet_01_region}-c", +# "${var.subnet_01_region}-f" +# ] +# project = data.google_project.environment.project_id + +# # Blocks +# autoscaling { +# location_policy = "ANY" +# total_max_node_count = 1000 +# total_min_node_count = 0 +# } + +# management { +# auto_repair = false +# auto_upgrade = false +# } + +# lifecycle { +# ignore_changes = [ +# node_config[0].labels, +# node_config[0].taint, +# ] +# } + +# network_config { +# enable_private_nodes = true +# } + +# node_config { +# # Variables +# labels = { +# "resource-model" : "a100" +# "resource-type" : "gpu" +# } +# machine_type = "a2-highgpu-2g" +# service_account = google_service_account.cluster.email +# oauth_scopes = [ +# "https://www.googleapis.com/auth/cloud-platform", +# "https://www.googleapis.com/auth/devstorage.read_only" +# ] + +# # Blocks +# gcfs_config { +# enabled = true +# } + +# guest_accelerator { +# count = 2 +# type = "nvidia-tesla-a100" + +# gpu_driver_installation_config { +# gpu_driver_version = var.gpu_driver_version +# } +# } + +# gvnic { +# enabled = true +# } + +# reservation_affinity { +# consume_reservation_type = "NO_RESERVATION" +# } + +# shielded_instance_config { +# enable_integrity_monitoring = true +# enable_secure_boot = true +# } + +# taint { +# effect = "NO_SCHEDULE" +# 
key = "on-demand" +# value = true +# } +# } + +# queued_provisioning { +# enabled = true +# } + +# timeouts { +# create = "30m" +# update = "20m" +# } +# } - reservation_affinity { - consume_reservation_type = "NO_RESERVATION" - } +############################################################################### - shielded_instance_config { - enable_integrity_monitoring = true - enable_secure_boot = true - } +# resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { +# depends_on = [google_gke_hub_membership.cluster] + +# # Variables +# cluster = google_container_cluster.mlp.name +# location = var.subnet_01_region +# name = "gpu-h100x8-a3h8-dws" +# node_locations = [ +# "${var.subnet_01_region}-a", +# "${var.subnet_01_region}-c" +# ] +# project = data.google_project.environment.project_id + +# # Blocks +# autoscaling { +# location_policy = "ANY" +# total_max_node_count = 1000 +# total_min_node_count = 0 +# } + +# management { +# auto_repair = false +# auto_upgrade = false +# } + +# lifecycle { +# ignore_changes = [ +# node_config[0].labels, +# node_config[0].taint, +# ] +# } + +# network_config { +# enable_private_nodes = true +# } + +# node_config { +# # Variables +# labels = { +# "resource-model" : "h100" +# "resource-type" : "gpu", +# "gke-no-default-nvidia-gpu-device-plugin" : "true", +# "nvidia.com/gpu" : "present", +# "nvidia.com/dra.kubelet-plugin" : "true" +# "nvidia.com/dra.controller" : "true" +# } +# machine_type = "a3-highgpu-8g" +# service_account = google_service_account.cluster.email +# oauth_scopes = [ +# "https://www.googleapis.com/auth/cloud-platform", +# "https://www.googleapis.com/auth/devstorage.read_only" +# ] + +# # Blocks +# ephemeral_storage_local_ssd_config { +# local_ssd_count = 16 +# } + +# gcfs_config { +# enabled = true +# } + +# guest_accelerator { +# count = 8 +# type = "nvidia-h100-80gb" + +# gpu_driver_installation_config { +# gpu_driver_version = var.gpu_driver_version +# } +# } + +# gvnic { +# enabled = true +# } + +# 
reservation_affinity { +# consume_reservation_type = "NO_RESERVATION" +# } + +# shielded_instance_config { +# enable_integrity_monitoring = true +# enable_secure_boot = true +# } + +# taint { +# effect = "NO_SCHEDULE" +# key = "on-demand" +# value = true +# } +# } + +# queued_provisioning { +# enabled = true +# } + +# timeouts { +# create = "30m" +# update = "20m" +# } +# } - taint { - effect = "NO_SCHEDULE" - key = "on-demand" - value = true - } - } - - timeouts { - create = "30m" - update = "20m" - } -} +############################################################################### -############################################################################## -# TODO: don't need for DRA demo -resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { +resource "google_container_node_pool" "gpu_l4x2_g2s24" { depends_on = [google_gke_hub_membership.cluster] # Variables cluster = google_container_cluster.mlp.name location = var.subnet_01_region - name = "gpu-a100x2-a2h2-dws" + name = "gpu-l4x2-g2s24" node_locations = [ "${var.subnet_01_region}-a", "${var.subnet_01_region}-b", - "${var.subnet_01_region}-c", - "${var.subnet_01_region}-f" + "${var.subnet_01_region}-c" ] project = data.google_project.environment.project_id - # Blocks autoscaling { location_policy = "ANY" total_max_node_count = 1000 @@ -230,10 +428,14 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { node_config { # Variables labels = { - "resource-model" : "a100" + "resource-model" : "l4" "resource-type" : "gpu" + "gke-no-default-nvidia-gpu-device-plugin" : "true", + "nvidia.com/gpu" : "present", + "nvidia.com/dra.kubelet-plugin" : "true" + "nvidia.com/dra.controller" : "true" } - machine_type = "a2-highgpu-2g" + machine_type = "g2-standard-24" service_account = google_service_account.cluster.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform", @@ -247,7 +449,7 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { guest_accelerator { count = 2 - type = 
"nvidia-tesla-a100" + type = "nvidia-l4" gpu_driver_installation_config { gpu_driver_version = var.gpu_driver_version @@ -274,10 +476,6 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { } } - queued_provisioning { - enabled = true - } - timeouts { create = "30m" update = "20m" @@ -286,20 +484,20 @@ resource "google_container_node_pool" "gpu_a100x2_a2h2_dws" { ############################################################################### -resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { +resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { depends_on = [google_gke_hub_membership.cluster] # Variables cluster = google_container_cluster.mlp.name location = var.subnet_01_region - name = "gpu-h100x8-a3h8-dws" + name = "gpu-l4x2-g2s24-dws" node_locations = [ "${var.subnet_01_region}-a", + "${var.subnet_01_region}-b", "${var.subnet_01_region}-c" ] project = data.google_project.environment.project_id - # Blocks autoscaling { location_policy = "ANY" total_max_node_count = 1000 @@ -325,14 +523,14 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { node_config { # Variables labels = { - "resource-model" : "h100" - "resource-type" : "gpu", + "resource-model" : "l4" + "resource-type" : "gpu" "gke-no-default-nvidia-gpu-device-plugin" : "true", "nvidia.com/gpu" : "present", "nvidia.com/dra.kubelet-plugin" : "true" "nvidia.com/dra.controller" : "true" } - machine_type = "a3-highgpu-8g" + machine_type = "g2-standard-24" service_account = google_service_account.cluster.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform", @@ -340,17 +538,13 @@ resource "google_container_node_pool" "gpu_h100x8_a3h8_dws" { ] # Blocks - ephemeral_storage_local_ssd_config { - local_ssd_count = 16 - } - gcfs_config { enabled = true } guest_accelerator { - count = 8 - type = "nvidia-h100-80gb" + count = 2 + type = "nvidia-l4" gpu_driver_installation_config { gpu_driver_version = var.gpu_driver_version @@ -389,13 +583,13 @@ resource 
"google_container_node_pool" "gpu_h100x8_a3h8_dws" { ############################################################################### -resource "google_container_node_pool" "gpu_l4x2_g2s24" { +resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { depends_on = [google_gke_hub_membership.cluster] # Variables cluster = google_container_cluster.mlp.name location = var.subnet_01_region - name = "gpu-l4x2-g2s24" + name = "gpu-l4x2-g2s24-spot" node_locations = [ "${var.subnet_01_region}-a", "${var.subnet_01_region}-b", @@ -403,6 +597,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { ] project = data.google_project.environment.project_id + # Blocks autoscaling { location_policy = "ANY" total_max_node_count = 1000 @@ -441,6 +636,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { "https://www.googleapis.com/auth/cloud-platform", "https://www.googleapis.com/auth/devstorage.read_only" ] + spot = true # Blocks gcfs_config { @@ -471,7 +667,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { taint { effect = "NO_SCHEDULE" - key = "on-demand" + key = "spot" value = true } } @@ -482,15 +678,17 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24" { } } +### Attached GPUs ############################################################################### -resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { +# n1s8 shape ($0.44/hr) with 2 nvidia-tesla-t4 attached ($0.35 per GPU/hr) autoscaling up to 2 nodes = $2.28/hr +resource "google_container_node_pool" "gpu_t4x2_n1s8" { depends_on = [google_gke_hub_membership.cluster] # Variables cluster = google_container_cluster.mlp.name location = var.subnet_01_region - name = "gpu-l4x2-g2s24-dws" + name = "gpu_t4x2_n1s8" node_locations = [ "${var.subnet_01_region}-a", "${var.subnet_01_region}-b", @@ -500,7 +698,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { autoscaling { location_policy = "ANY" - total_max_node_count = 1000 + total_max_node_count = 2 total_min_node_count = 0 
} @@ -523,14 +721,14 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { node_config { # Variables labels = { - "resource-model" : "l4" + "resource-model" : "t4" "resource-type" : "gpu" "gke-no-default-nvidia-gpu-device-plugin" : "true", "nvidia.com/gpu" : "present", "nvidia.com/dra.kubelet-plugin" : "true" "nvidia.com/dra.controller" : "true" } - machine_type = "g2-standard-24" + machine_type = "n1-standard-8" service_account = google_service_account.cluster.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform", @@ -544,7 +742,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { guest_accelerator { count = 2 - type = "nvidia-l4" + type = "nvidia-tesla-t4" gpu_driver_installation_config { gpu_driver_version = var.gpu_driver_version @@ -571,10 +769,6 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { } } - queued_provisioning { - enabled = true - } - timeouts { create = "30m" update = "20m" @@ -583,13 +777,14 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_dws" { ############################################################################### -resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { +# n1s8 shape ($0.44/hr) with 2 nvidia-tesla-v100 attached ($2.48 per GPU/hr) autoscaling up to 2 nodes = $10.80/hr +resource "google_container_node_pool" "gpu_v100x2_n1s8" { depends_on = [google_gke_hub_membership.cluster] # Variables cluster = google_container_cluster.mlp.name location = var.subnet_01_region - name = "gpu-l4x2-g2s24-spot" + name = "gpu-v100x2-n1s8" node_locations = [ "${var.subnet_01_region}-a", "${var.subnet_01_region}-b", @@ -597,10 +792,9 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { ] project = data.google_project.environment.project_id - # Blocks autoscaling { location_policy = "ANY" - total_max_node_count = 1000 + total_max_node_count = 2 total_min_node_count = 0 } @@ -623,20 +817,19 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { node_config { # 
Variables labels = { - "resource-model" : "l4" + "resource-model" : "v100" "resource-type" : "gpu" "gke-no-default-nvidia-gpu-device-plugin" : "true", "nvidia.com/gpu" : "present", "nvidia.com/dra.kubelet-plugin" : "true" "nvidia.com/dra.controller" : "true" } - machine_type = "g2-standard-24" + machine_type = "n1-standard-8" service_account = google_service_account.cluster.email oauth_scopes = [ "https://www.googleapis.com/auth/cloud-platform", "https://www.googleapis.com/auth/devstorage.read_only" ] - spot = true # Blocks gcfs_config { @@ -645,7 +838,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { guest_accelerator { count = 2 - type = "nvidia-l4" + type = "nvidia-tesla-v100" gpu_driver_installation_config { gpu_driver_version = var.gpu_driver_version @@ -667,7 +860,7 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { taint { effect = "NO_SCHEDULE" - key = "spot" + key = "on-demand" value = true } } @@ -678,8 +871,6 @@ resource "google_container_node_pool" "gpu_l4x2_g2s24_spot" { } } - - # TPU # Available zones: https://cloud.google.com/tpu/docs/regions-zones ############################################################################### diff --git a/terraform/modules/cluster/gke.tf b/terraform/modules/cluster/gke.tf index 7ad6d3fb..7e2d198b 100644 --- a/terraform/modules/cluster/gke.tf +++ b/terraform/modules/cluster/gke.tf @@ -202,7 +202,7 @@ resource "google_container_cluster" "mlp" { private_cluster_config { enable_private_nodes = true - enable_private_endpoint = true + enable_private_endpoint = false master_ipv4_cidr_block = "172.16.0.32/28" }