From 7948faa51dc417a23eb1989dab6bca3ae5a318fc Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Thu, 28 Nov 2024 09:55:00 +0100
Subject: [PATCH 01/29] switch the GPU node AMI to an Ubuntu EKS image that
 supports the GPU operator

---
 locals.tf | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/locals.tf b/locals.tf
index f58774d..3929415 100644
--- a/locals.tf
+++ b/locals.tf
@@ -3,7 +3,7 @@ data "aws_ami" "al2gpu_ami" {
   most_recent = true
   filter {
     name   = "name"
-    values = ["*amazon-eks-gpu-node-${var.kubernetesVersion}*"]
+    values = ["*ubuntu-eks/k8s_${var.kubernetesVersion}/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server*"]
   }
 }
 
@@ -30,7 +30,6 @@ locals {
   private_subnets                           = local.create_vpc ? module.vpc[0].private_subnets : (local.use_private_subnets_ids ? var.private_subnet_ids : [for s in data.aws_subnet.private_subnet : s.id])
   public_subnets                            = local.create_vpc ? module.vpc[0].public_subnets : (local.use_public_subnet_ids ? var.public_subnet_ids : [for s in data.aws_subnet.public_subnet : s.id])
   # Using a one-line command for gpuPostUserData to avoid issues due to different line endings between Windows and Linux.
-  gpuPostUserData = "sudo yum -y erase nvidia-driver \nsudo yum -y install make gcc \nsudo yum -y update \nsudo yum -y install gcc kernel-devel-$(uname -r) \nsudo curl -fSsl -O https://us.download.nvidia.com/tesla/${var.gpuNvidiaDriverVersion}/NVIDIA-Linux-x86_64-${var.gpuNvidiaDriverVersion}.run \nsudo chmod +x NVIDIA-Linux-x86_64*.run \nsudo CC=/usr/bin/gcc10-cc ./NVIDIA-Linux-x86_64*.run -s --no-dkms --install-libglvnd \nsudo touch /etc/modprobe.d/nvidia.conf \necho \"options nvidia NVreg_EnableGpuFirmware=0\" | sudo tee --append /etc/modprobe.d/nvidia.conf \nsudo reboot"
 
   default_managed_node_pools = {
     "default" = {
@@ -74,7 +73,12 @@ locals {
       disk_size              = var.gpuNodeDiskSize
       custom_ami_id          = data.aws_ami.al2gpu_ami.image_id
       create_launch_template = true
-      post_userdata          = local.gpuPostUserData
+      block_device_mappings = [{
+        device_name           = "/dev/sda1"
+        volume_type           = "gp2"
+        volume_size           = 128
+        delete_on_termination = true
+      }]
       k8s_labels = {
         "purpose" = "gpu"
       }
@@ -99,7 +103,12 @@ locals {
       disk_size              = var.ivsGpuNodeDiskSize
       custom_ami_id          = data.aws_ami.al2gpu_ami.image_id
       create_launch_template = true
-      post_userdata          = local.gpuPostUserData
+      block_device_mappings = [{
+        device_name           = "/dev/sda1"
+        volume_type           = "gp2"
+        volume_size           = 128
+        delete_on_termination = true
+      }]
       k8s_labels = {
         "product" = "ivs",
         "purpose" = "gpu"

From dfb0fbb087b38a4eb16a99f3f703408437fe21fa Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Thu, 28 Nov 2024 13:33:48 +0100
Subject: [PATCH 02/29] remove variable gpuNvidiaDriverVersion

---
 variables.tf | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/variables.tf b/variables.tf
index 7149dc8..05e191b 100644
--- a/variables.tf
+++ b/variables.tf
@@ -118,12 +118,6 @@ variable "ivsGpuNodeDiskSize" {
   default     = 100
 }
 
-variable "gpuNvidiaDriverVersion" {
-  type        = string
-  description = "The NVIDIA driver version for GPU node group."
-  default     = "535.54.03"
-}
-
 variable "licenseServer" {
   type        = bool
   description = "Specifies whether a license server VM will be created."

From 82b08618bf37a66656e025c1d9b5898d969ff2f8 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Mon, 2 Dec 2024 15:37:38 +0100
Subject: [PATCH 03/29] add gpu operator as eks-addon

---
 k8s-eks-addons.tf                      |  1 +
 modules/k8s_eks_addons/gpu-operator.tf | 23 ++++++++++++
 modules/k8s_eks_addons/variables.tf    | 10 ++++++
 variables.tf                           | 48 ++++++++++++++++++++++++++
 4 files changed, 82 insertions(+)
 create mode 100644 modules/k8s_eks_addons/gpu-operator.tf

diff --git a/k8s-eks-addons.tf b/k8s-eks-addons.tf
index 0f77f4c..3c06ff0 100644
--- a/k8s-eks-addons.tf
+++ b/k8s-eks-addons.tf
@@ -6,6 +6,7 @@ module "k8s_eks_addons" {
   coredns_config                      = var.coredns_config
   s3_csi_config                       = var.s3_csi_config
   aws_load_balancer_controller_config = var.aws_load_balancer_controller_config
+  gpu_operator_config                 = var.gpu_operator_config
 
   addon_context = {
     aws_caller_identity_account_id = data.aws_caller_identity.current.account_id
diff --git a/modules/k8s_eks_addons/gpu-operator.tf b/modules/k8s_eks_addons/gpu-operator.tf
new file mode 100644
index 0000000..79ad623
--- /dev/null
+++ b/modules/k8s_eks_addons/gpu-operator.tf
@@ -0,0 +1,23 @@
+resource "kubernetes_namespace_v1" "gpu_operator" {
+  count = var.gpu_operator_config.enable ? 1 : 0
+
+  metadata {
+    name = "gpu-operator"
+  }
+}
+
+resource "helm_release" "gpu_operator" {
+  count = var.gpu_operator_config.enable ? 1 : 0
+
+  namespace         = kubernetes_namespace_v1.gpu_operator[0].metadata[0].name
+  name              = "gpu-operator"
+  chart             = "gpu-operator"
+  repository        = var.gpu_operator_config.helm_repository
+  version           = var.gpu_operator_config.helm_version
+  description       = "The GPU operator HelmChart deployment configuration"
+  dependency_update = true
+  values = [
+    var.gpu_operator_config.chart_values
+  ]
+  timeout = 1200
+}
diff --git a/modules/k8s_eks_addons/variables.tf b/modules/k8s_eks_addons/variables.tf
index 39ced2a..21498d5 100644
--- a/modules/k8s_eks_addons/variables.tf
+++ b/modules/k8s_eks_addons/variables.tf
@@ -49,3 +49,13 @@ variable "s3_csi_config" {
   })
   description = "Input configuration for AWS EKS add-on aws-mountpoint-s3-csi-driver."
 }
+
+variable "gpu_operator_config" {
+  description = "GPU operator configuration"
+  type = object({
+    enable          = bool
+    helm_repository = string
+    helm_version    = string
+    chart_values    = string
+  })
+}
diff --git a/variables.tf b/variables.tf
index 05e191b..7acfa10 100644
--- a/variables.tf
+++ b/variables.tf
@@ -369,3 +369,51 @@ variable "aws_load_balancer_controller_config" {
     enable = false
   }
 }
+
+variable "gpu_operator_config" {
+  type = object({
+    enable          = optional(bool, true)
+    helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
+    helm_version    = optional(string, "v24.9.0")
+    chart_values = optional(string, <<-YAML
+operator:
+  defaultRuntime: containerd
+
+dcgmExporter:
+  enabled: false
+
+driver:
+  enabled: true
+  version: "550.90.07"
+
+validator:
+  driver:
+    env:
+    - name: DISABLE_DEV_CHAR_SYMLINK_CREATION
+      value: "true"
+
+toolkit:
+  enabled: true
+
+daemonsets:
+  tolerations:
+  - key: purpose
+    value: gpu
+    operator: Equal
+    effect: NoSchedule
+
+node-feature-discovery:
+  worker:
+    tolerations:
+    - key: purpose
+      value: gpu
+      operator: Equal
+      effect: NoSchedule
+YAML
+    )
+  })
+  description = "Input configuration for GPU operator deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of an GPU operator chart."
+  default = {
+    enable = false
+  }
+}

From c6db00526bb7f7a34d6e384983b5feaaa922a942 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Mon, 2 Dec 2024 15:55:35 +0100
Subject: [PATCH 04/29] replace namespace resource with helm create_namespace

---
 modules/k8s_eks_addons/gpu-operator.tf | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/modules/k8s_eks_addons/gpu-operator.tf b/modules/k8s_eks_addons/gpu-operator.tf
index 79ad623..aa6a920 100644
--- a/modules/k8s_eks_addons/gpu-operator.tf
+++ b/modules/k8s_eks_addons/gpu-operator.tf
@@ -1,17 +1,10 @@
-resource "kubernetes_namespace_v1" "gpu_operator" {
-  count = var.gpu_operator_config.enable ? 1 : 0
-
-  metadata {
-    name = "gpu-operator"
-  }
-}
-
 resource "helm_release" "gpu_operator" {
   count = var.gpu_operator_config.enable ? 1 : 0
 
   namespace         = kubernetes_namespace_v1.gpu_operator[0].metadata[0].name
   name              = "gpu-operator"
   chart             = "gpu-operator"
+  create_namespace  = true
   repository        = var.gpu_operator_config.helm_repository
   version           = var.gpu_operator_config.helm_version
   description       = "The GPU operator HelmChart deployment configuration"

From 61b20159f4058e8cac42206af54c751bb05626cd Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Mon, 2 Dec 2024 15:58:12 +0100
Subject: [PATCH 05/29] hardcode namespace to fix reference to removed namespace resource

---
 modules/k8s_eks_addons/gpu-operator.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/k8s_eks_addons/gpu-operator.tf b/modules/k8s_eks_addons/gpu-operator.tf
index aa6a920..6578ac8 100644
--- a/modules/k8s_eks_addons/gpu-operator.tf
+++ b/modules/k8s_eks_addons/gpu-operator.tf
@@ -1,7 +1,7 @@
 resource "helm_release" "gpu_operator" {
   count = var.gpu_operator_config.enable ? 1 : 0
 
-  namespace         = kubernetes_namespace_v1.gpu_operator[0].metadata[0].name
+  namespace         = "gpu-operator"
   name              = "gpu-operator"
   chart             = "gpu-operator"
   create_namespace  = true

From 57f179d8a7cf7f1cf1dbadd516b976d1ea91ed0a Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Tue, 3 Dec 2024 08:30:16 +0100
Subject: [PATCH 06/29] add driver version and remove default chart values

---
 modules/k8s_eks_addons/gpu-operator.tf |  5 ++++
 variables.tf                           | 36 +++-----------------------
 2 files changed, 8 insertions(+), 33 deletions(-)

diff --git a/modules/k8s_eks_addons/gpu-operator.tf b/modules/k8s_eks_addons/gpu-operator.tf
index 6578ac8..ceb8b2f 100644
--- a/modules/k8s_eks_addons/gpu-operator.tf
+++ b/modules/k8s_eks_addons/gpu-operator.tf
@@ -13,4 +13,9 @@ resource "helm_release" "gpu_operator" {
     var.gpu_operator_config.chart_values
   ]
   timeout = 1200
+  set {
+    name  = "driver.version"
+    value = var.gpu_operator_config.driver_version
+  }
+
 }
diff --git a/variables.tf b/variables.tf
index 7acfa10..40e79d6 100644
--- a/variables.tf
+++ b/variables.tf
@@ -375,40 +375,9 @@ variable "gpu_operator_config" {
     enable          = optional(bool, true)
     helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
     helm_version    = optional(string, "v24.9.0")
+    driver_version  = string
     chart_values = optional(string, <<-YAML
-operator:
-  defaultRuntime: containerd
-
-dcgmExporter:
-  enabled: false
-
-driver:
-  enabled: true
-  version: "550.90.07"
-
-validator:
-  driver:
-    env:
-    - name: DISABLE_DEV_CHAR_SYMLINK_CREATION
-      value: "true"
-
-toolkit:
-  enabled: true
-
-daemonsets:
-  tolerations:
-  - key: purpose
-    value: gpu
-    operator: Equal
-    effect: NoSchedule
-
-node-feature-discovery:
-  worker:
-    tolerations:
-    - key: purpose
-      value: gpu
-      operator: Equal
-      effect: NoSchedule
+
 YAML
     )
   })
@@ -417,3 +386,4 @@ YAML
     enable = false
   }
 }
+

From e775b2b741c081091c949623f9e95701ff26ace8 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Tue, 3 Dec 2024 08:46:27 +0100
Subject: [PATCH 07/29] remove default value of gpu_operator_config

---
 variables.tf | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/variables.tf b/variables.tf
index 40e79d6..865cf89 100644
--- a/variables.tf
+++ b/variables.tf
@@ -382,8 +382,5 @@ YAML
     )
   })
   description = "Input configuration for GPU operator deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of an GPU operator chart."
-  default = {
-    enable = false
-  }
 }
 

From 901af33b79f06c02404cbc2008ae19b1ed2e7572 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Tue, 3 Dec 2024 08:57:29 +0100
Subject: [PATCH 08/29] add driver_version to variables.tf of the eks-addons

---
 modules/k8s_eks_addons/variables.tf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/modules/k8s_eks_addons/variables.tf b/modules/k8s_eks_addons/variables.tf
index 21498d5..81003e2 100644
--- a/modules/k8s_eks_addons/variables.tf
+++ b/modules/k8s_eks_addons/variables.tf
@@ -57,5 +57,6 @@ variable "gpu_operator_config" {
     helm_repository = string
     helm_version    = string
     chart_values    = string
+    driver_version  = string
   })
 }

From 1c13fb48a632ef2fecf51c3bdcf5dce9faf43b57 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 14:51:13 +0000
Subject: [PATCH 09/29] terraform-docs: automated action

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 625ac46..ccdbda3 100644
--- a/README.md
+++ b/README.md
@@ -554,7 +554,7 @@ Encryption is enabled at all AWS resources that are created by Terraform:
 | <a name="input_gpuNodeDiskSize"></a> [gpuNodeDiskSize](#input\_gpuNodeDiskSize) | The disk size in GiB of the nodes for the gpu job execution | `number` | `100` | no |
 | <a name="input_gpuNodePool"></a> [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no |
 | <a name="input_gpuNodeSize"></a> [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `list(string)` | <pre>[<br>  "g5.2xlarge"<br>]</pre> | no |
-| <a name="input_gpuNvidiaDriverVersion"></a> [gpuNvidiaDriverVersion](#input\_gpuNvidiaDriverVersion) | The NVIDIA driver version for GPU node group. | `string` | `"535.54.03"` | no |
+| <a name="input_gpu_operator_config"></a> [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for GPU operator deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an GPU operator chart. | <pre>object({<br>    enable          = optional(bool, true)<br>    helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")<br>    helm_version    = optional(string, "v24.9.0")<br>    driver_version  = string<br>    chart_values = optional(string, <<-YAML<br><br>YAML<br>    )<br>  })</pre> | n/a | yes |
 | <a name="input_infrastructurename"></a> [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | `"simphera"` | no |
 | <a name="input_ingress_nginx_config"></a> [ingress\_nginx\_config](#input\_ingress\_nginx\_config) | Input configuration for ingress-nginx service deployed with helm release. By setting key 'enable' to 'true', ingress-nginx service will be deployed. 'helm\_repository' is an URL for the repository of ingress-nginx helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an ingress-nginx chart. | <pre>object({<br>    enable          = bool<br>    helm_repository = optional(string, "https://kubernetes.github.io/ingress-nginx")<br>    helm_version    = optional(string, "4.1.4")<br>    chart_values = optional(string, <<-YAML<br>controller:<br>  images:<br>    registry: "registry.k8s.io"<br>  service:<br>    annotations:<br>      service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing<br>YAML<br>    )<br>  })</pre> | <pre>{<br>  "enable": false<br>}</pre> | no |
 | <a name="input_install_schedule"></a> [install\_schedule](#input\_install\_schedule) | 6-field Cron expression describing the install maintenance schedule. Must not overlap with variable scan\_schedule. | `string` | `"cron(0 3 * * ? *)"` | no |

From a1419824cd55e65276f480a8ddcc5ae622f92d2a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 14:51:15 +0000
Subject: [PATCH 10/29] terraform-docs: automated action

---
 terraform.tfvars.example | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/terraform.tfvars.example b/terraform.tfvars.example
index 865b793..b3d3909 100644
--- a/terraform.tfvars.example
+++ b/terraform.tfvars.example
@@ -61,8 +61,11 @@ gpuNodeSize = [
   "g5.2xlarge"
 ]
 
-# The NVIDIA driver version for GPU node group.
-gpuNvidiaDriverVersion = "535.54.03"
+# Input configuration for GPU operator deployed with helm release.
+# By setting key 'enable' to 'true', GPU operator will be deployed.
+# 'helm_repository' is an URL for the repository of GPU operator helm chart, where 'helm_version' is its respective version of a chart.
+# 'chart_values' is used for changing default values.yaml of an GPU operator chart.
+gpu_operator_config = 
 
 # The name of the infrastructure. e.g. simphera-infra
 infrastructurename = "simphera"

From b8ac7e3ac676db3286ecbc3ec6fd49acdfa9c8be Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 14:51:16 +0000
Subject: [PATCH 11/29] terraform-docs: automated action

---
 terraform.json.example | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/terraform.json.example b/terraform.json.example
index 1c48f79..ba19074 100644
--- a/terraform.json.example
+++ b/terraform.json.example
@@ -21,7 +21,7 @@
   "gpuNodeSize": [
     "g5.2xlarge"
   ],
-  "gpuNvidiaDriverVersion": "535.54.03",
+  "gpu_operator_config": null,
   "infrastructurename": "simphera",
   "ingress_nginx_config": {
     "enable": false

From 74a723d738e767119f57e119676fa413cb317233 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Tue, 3 Dec 2024 15:57:52 +0100
Subject: [PATCH 12/29] improve wording of gpu_operator_config description

---
 variables.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/variables.tf b/variables.tf
index 865cf89..87669ce 100644
--- a/variables.tf
+++ b/variables.tf
@@ -381,6 +381,6 @@ variable "gpu_operator_config" {
 YAML
     )
   })
-  description = "Input configuration for GPU operator deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of an GPU operator chart."
+  description = "Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of the GPU operator chart."
 }
 

From a7b235011358fa37f7af387042b81ecd1ca3d3ad Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Tue, 3 Dec 2024 15:58:33 +0100
Subject: [PATCH 13/29] remove empty line

---
 variables.tf | 1 -
 1 file changed, 1 deletion(-)

diff --git a/variables.tf b/variables.tf
index 87669ce..c33e8b8 100644
--- a/variables.tf
+++ b/variables.tf
@@ -383,4 +383,3 @@ YAML
   })
   description = "Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of the GPU operator chart."
 }
-

From 15ca2bbe5575a48fee8952c74169365b6c02d122 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 14:59:37 +0000
Subject: [PATCH 14/29] terraform-docs: automated action

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ccdbda3..903cb55 100644
--- a/README.md
+++ b/README.md
@@ -554,7 +554,7 @@ Encryption is enabled at all AWS resources that are created by Terraform:
 | <a name="input_gpuNodeDiskSize"></a> [gpuNodeDiskSize](#input\_gpuNodeDiskSize) | The disk size in GiB of the nodes for the gpu job execution | `number` | `100` | no |
 | <a name="input_gpuNodePool"></a> [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no |
 | <a name="input_gpuNodeSize"></a> [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `list(string)` | <pre>[<br>  "g5.2xlarge"<br>]</pre> | no |
-| <a name="input_gpu_operator_config"></a> [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for GPU operator deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an GPU operator chart. | <pre>object({<br>    enable          = optional(bool, true)<br>    helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")<br>    helm_version    = optional(string, "v24.9.0")<br>    driver_version  = string<br>    chart_values = optional(string, <<-YAML<br><br>YAML<br>    )<br>  })</pre> | n/a | yes |
+| <a name="input_gpu_operator_config"></a> [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. | <pre>object({<br>    enable          = optional(bool, true)<br>    helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")<br>    helm_version    = optional(string, "v24.9.0")<br>    driver_version  = string<br>    chart_values = optional(string, <<-YAML<br><br>YAML<br>    )<br>  })</pre> | n/a | yes |
 | <a name="input_infrastructurename"></a> [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | `"simphera"` | no |
 | <a name="input_ingress_nginx_config"></a> [ingress\_nginx\_config](#input\_ingress\_nginx\_config) | Input configuration for ingress-nginx service deployed with helm release. By setting key 'enable' to 'true', ingress-nginx service will be deployed. 'helm\_repository' is an URL for the repository of ingress-nginx helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an ingress-nginx chart. | <pre>object({<br>    enable          = bool<br>    helm_repository = optional(string, "https://kubernetes.github.io/ingress-nginx")<br>    helm_version    = optional(string, "4.1.4")<br>    chart_values = optional(string, <<-YAML<br>controller:<br>  images:<br>    registry: "registry.k8s.io"<br>  service:<br>    annotations:<br>      service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing<br>YAML<br>    )<br>  })</pre> | <pre>{<br>  "enable": false<br>}</pre> | no |
 | <a name="input_install_schedule"></a> [install\_schedule](#input\_install\_schedule) | 6-field Cron expression describing the install maintenance schedule. Must not overlap with variable scan\_schedule. | `string` | `"cron(0 3 * * ? *)"` | no |

From ab5ae14817d828c692b5f64753b1455e56549270 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 14:59:39 +0000
Subject: [PATCH 15/29] terraform-docs: automated action

---
 terraform.tfvars.example | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/terraform.tfvars.example b/terraform.tfvars.example
index b3d3909..6f8f4d1 100644
--- a/terraform.tfvars.example
+++ b/terraform.tfvars.example
@@ -61,10 +61,10 @@ gpuNodeSize = [
   "g5.2xlarge"
 ]
 
-# Input configuration for GPU operator deployed with helm release.
+# Input configuration for the GPU operator chart deployed with helm release.
 # By setting key 'enable' to 'true', GPU operator will be deployed.
-# 'helm_repository' is an URL for the repository of GPU operator helm chart, where 'helm_version' is its respective version of a chart.
-# 'chart_values' is used for changing default values.yaml of an GPU operator chart.
+# 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart.
+# 'chart_values' is used for changing default values.yaml of the GPU operator chart.
 gpu_operator_config = 
 
 # The name of the infrastructure. e.g. simphera-infra

From 2c5274a949b6348794153b20c7bcbaea40cbc0c3 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Tue, 3 Dec 2024 16:46:55 +0100
Subject: [PATCH 16/29] add default gpu operator helm values

---
 variables.tf | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/variables.tf b/variables.tf
index c33e8b8..0197e68 100644
--- a/variables.tf
+++ b/variables.tf
@@ -375,9 +375,40 @@ variable "gpu_operator_config" {
     enable          = optional(bool, true)
     helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
     helm_version    = optional(string, "v24.9.0")
-    driver_version  = string
+    driver_version  = optional(string, "550.90.07")
     chart_values = optional(string, <<-YAML
-
+operator:
+  defaultRuntime: containerd
+
+dcgmExporter:
+  enabled: false
+
+driver:
+  enabled: true
+
+validator:
+  driver:
+    env:
+    - name: DISABLE_DEV_CHAR_SYMLINK_CREATION
+      value: "true"
+
+toolkit:
+  enabled: true
+
+daemonsets:
+  tolerations:
+  - key: purpose
+    value: gpu
+    operator: Equal
+    effect: NoSchedule
+
+node-feature-discovery:
+  worker:
+    tolerations:
+    - key: purpose
+      value: gpu
+      operator: Equal
+      effect: NoSchedule
 YAML
     )
   })

From f56b3c145fcbe4cd2293e2cf795808081bb72e81 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Tue, 3 Dec 2024 15:47:59 +0000
Subject: [PATCH 17/29] terraform-docs: automated action

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 903cb55..7518889 100644
--- a/README.md
+++ b/README.md
@@ -554,7 +554,7 @@ Encryption is enabled at all AWS resources that are created by Terraform:
 | <a name="input_gpuNodeDiskSize"></a> [gpuNodeDiskSize](#input\_gpuNodeDiskSize) | The disk size in GiB of the nodes for the gpu job execution | `number` | `100` | no |
 | <a name="input_gpuNodePool"></a> [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no |
 | <a name="input_gpuNodeSize"></a> [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `list(string)` | <pre>[<br>  "g5.2xlarge"<br>]</pre> | no |
-| <a name="input_gpu_operator_config"></a> [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. | <pre>object({<br>    enable          = optional(bool, true)<br>    helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")<br>    helm_version    = optional(string, "v24.9.0")<br>    driver_version  = string<br>    chart_values = optional(string, <<-YAML<br><br>YAML<br>    )<br>  })</pre> | n/a | yes |
+| <a name="input_gpu_operator_config"></a> [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. | <pre>object({<br>    enable          = optional(bool, true)<br>    helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")<br>    helm_version    = optional(string, "v24.9.0")<br>    driver_version  = optional(string, "550.90.07")<br>    chart_values = optional(string, <<-YAML<br>operator:<br>  defaultRuntime: containerd<br><br>dcgmExporter:<br>  enabled: false<br><br>driver:<br>  enabled: true<br><br>validator:<br>  driver:<br>    env:<br>    - name: DISABLE_DEV_CHAR_SYMLINK_CREATION<br>      value: "true"<br><br>toolkit:<br>  enabled: true<br><br>daemonsets:<br>  tolerations:<br>  - key: purpose<br>    value: gpu<br>    operator: Equal<br>    effect: NoSchedule<br><br>node-feature-discovery:<br>  worker:<br>    tolerations:<br>    - key: purpose<br>      value: gpu<br>      operator: Equal<br>      effect: NoSchedule<br>YAML<br>    )<br>  })</pre> | n/a | yes |
 | <a name="input_infrastructurename"></a> [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | `"simphera"` | no |
 | <a name="input_ingress_nginx_config"></a> [ingress\_nginx\_config](#input\_ingress\_nginx\_config) | Input configuration for ingress-nginx service deployed with helm release. By setting key 'enable' to 'true', ingress-nginx service will be deployed. 'helm\_repository' is an URL for the repository of ingress-nginx helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an ingress-nginx chart. | <pre>object({<br>    enable          = bool<br>    helm_repository = optional(string, "https://kubernetes.github.io/ingress-nginx")<br>    helm_version    = optional(string, "4.1.4")<br>    chart_values = optional(string, <<-YAML<br>controller:<br>  images:<br>    registry: "registry.k8s.io"<br>  service:<br>    annotations:<br>      service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing<br>YAML<br>    )<br>  })</pre> | <pre>{<br>  "enable": false<br>}</pre> | no |
 | <a name="input_install_schedule"></a> [install\_schedule](#input\_install\_schedule) | 6-field Cron expression describing the install maintenance schedule. Must not overlap with variable scan\_schedule. | `string` | `"cron(0 3 * * ? *)"` | no |

From af1916c0cef3fde1dd61902b5fd0a78ad6eda579 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Thu, 5 Dec 2024 11:48:28 +0100
Subject: [PATCH 18/29] disable launch template creation for GPU node pool

---
 locals.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/locals.tf b/locals.tf
index 3929415..773e656 100644
--- a/locals.tf
+++ b/locals.tf
@@ -72,7 +72,7 @@ locals {
       min_size               = var.gpuNodeCountMin
       disk_size              = var.gpuNodeDiskSize
       custom_ami_id          = data.aws_ami.al2gpu_ami.image_id
-      create_launch_template = true
+      create_launch_template = false
       block_device_mappings = [{
         device_name           = "/dev/sda1"
         volume_type           = "gp2"

From 77525890e8521bc97f162ae929a92f22e4b25e73 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Thu, 5 Dec 2024 13:06:24 +0100
Subject: [PATCH 19/29] enable launch template

---
 locals.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/locals.tf b/locals.tf
index 773e656..3929415 100644
--- a/locals.tf
+++ b/locals.tf
@@ -72,7 +72,7 @@ locals {
       min_size               = var.gpuNodeCountMin
       disk_size              = var.gpuNodeDiskSize
       custom_ami_id          = data.aws_ami.al2gpu_ami.image_id
-      create_launch_template = false
+      create_launch_template = true
       block_device_mappings = [{
         device_name           = "/dev/sda1"
         volume_type           = "gp2"

From 498e43d9bd500f969b8cc076fa5b97776c884ea8 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Thu, 5 Dec 2024 14:00:45 +0100
Subject: [PATCH 20/29] try another way: move gpuexecnodes into the preceding node pool map

---
 locals.tf | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/locals.tf b/locals.tf
index 3929415..df360e5 100644
--- a/locals.tf
+++ b/locals.tf
@@ -59,10 +59,7 @@ locals {
           "effect" = "NO_SCHEDULE"
         }
       ]
-    }
-  }
-
-  gpu_node_pool = {
+    },
     "gpuexecnodes" = {
       node_group_name        = "gpuexecnodes"
       instance_types         = var.gpuNodeSize
@@ -92,6 +89,10 @@ locals {
     }
   }
 
+  # gpu_node_pool = {
+
+  # }
+
   ivsgpu_node_pool = {
     "gpuivsnodes" = {
       node_group_name        = "gpuivsnodes"

From e4adfb94c390743f433d2d48750d6611029ae91b Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Thu, 5 Dec 2024 14:15:51 +0100
Subject: [PATCH 21/29] test with create_launch_template = false and restore gpu_node_pool

---
 locals.tf | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/locals.tf b/locals.tf
index df360e5..773e656 100644
--- a/locals.tf
+++ b/locals.tf
@@ -59,7 +59,10 @@ locals {
           "effect" = "NO_SCHEDULE"
         }
       ]
-    },
+    }
+  }
+
+  gpu_node_pool = {
     "gpuexecnodes" = {
       node_group_name        = "gpuexecnodes"
       instance_types         = var.gpuNodeSize
@@ -69,7 +72,7 @@ locals {
       min_size               = var.gpuNodeCountMin
       disk_size              = var.gpuNodeDiskSize
       custom_ami_id          = data.aws_ami.al2gpu_ami.image_id
-      create_launch_template = true
+      create_launch_template = false
       block_device_mappings = [{
         device_name           = "/dev/sda1"
         volume_type           = "gp2"
@@ -89,10 +92,6 @@ locals {
     }
   }
 
-  # gpu_node_pool = {
-
-  # }
-
   ivsgpu_node_pool = {
     "gpuivsnodes" = {
       node_group_name        = "gpuivsnodes"

From 8471a52b0edeee73e53e1e76fbf232f4859e642e Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Thu, 5 Dec 2024 14:17:19 +0100
Subject: [PATCH 22/29] reset create_launch_template to true

---
 locals.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/locals.tf b/locals.tf
index 773e656..3929415 100644
--- a/locals.tf
+++ b/locals.tf
@@ -72,7 +72,7 @@ locals {
       min_size               = var.gpuNodeCountMin
       disk_size              = var.gpuNodeDiskSize
       custom_ami_id          = data.aws_ami.al2gpu_ami.image_id
-      create_launch_template = false
+      create_launch_template = true
       block_device_mappings = [{
         device_name           = "/dev/sda1"
         volume_type           = "gp2"

From 1b06f076a410e92b4b8b58ea182e6b22dad09bd5 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Fri, 6 Dec 2024 10:40:29 +0100
Subject: [PATCH 23/29] fix description

---
 terraform.tfvars.example | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/terraform.tfvars.example b/terraform.tfvars.example
index 6f8f4d1..7631570 100644
--- a/terraform.tfvars.example
+++ b/terraform.tfvars.example
@@ -65,7 +65,17 @@ gpuNodeSize = [
 # By setting key 'enable' to 'true', GPU operator will be deployed.
 # 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart.
 # 'chart_values' is used for changing default values.yaml of the GPU operator chart.
-gpu_operator_config = 
+gpu_operator_config = {
+  type = object({
+    enable          = optional(bool, true)
+    helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
+    helm_version    = optional(string, "v24.9.0")
+    driver_version  = optional(string, "550.90.07")
+    chart_values = optional(string, <<-YAML
+YAML
+    )
+  })
+}
 
 # The name of the infrastructure. e.g. simphera-infra
 infrastructurename = "simphera"

From 7fad415e507cb9425a00d100cc57c106a7f7236f Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Fri, 6 Dec 2024 09:41:39 +0000
Subject: [PATCH 24/29] terraform-docs: automated action

---
 terraform.tfvars.example | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/terraform.tfvars.example b/terraform.tfvars.example
index 7631570..6f8f4d1 100644
--- a/terraform.tfvars.example
+++ b/terraform.tfvars.example
@@ -65,17 +65,7 @@ gpuNodeSize = [
 # By setting key 'enable' to 'true', GPU operator will be deployed.
 # 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart.
 # 'chart_values' is used for changing default values.yaml of the GPU operator chart.
-gpu_operator_config = {
-  type = object({
-    enable          = optional(bool, true)
-    helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
-    helm_version    = optional(string, "v24.9.0")
-    driver_version  = optional(string, "550.90.07")
-    chart_values = optional(string, <<-YAML
-YAML
-    )
-  })
-}
+gpu_operator_config = 
 
 # The name of the infrastructure. e.g. simphera-infra
 infrastructurename = "simphera"

From b3eb332e65734d91883cdea49c004ebc3e6845d5 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Fri, 6 Dec 2024 10:42:56 +0100
Subject: [PATCH 25/29] try gp3 and change namespace

---
 locals.tf                              | 4 ++--
 modules/k8s_eks_addons/gpu-operator.tf | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/locals.tf b/locals.tf
index 3929415..908780c 100644
--- a/locals.tf
+++ b/locals.tf
@@ -75,7 +75,7 @@ locals {
       create_launch_template = true
       block_device_mappings = [{
         device_name           = "/dev/sda1"
-        volume_type           = "gp2"
+        volume_type           = "gp3"
         volume_size           = 128
         delete_on_termination = true
       }]
@@ -105,7 +105,7 @@ locals {
       create_launch_template = true
       block_device_mappings = [{
         device_name           = "/dev/sda1"
-        volume_type           = "gp2"
+        volume_type           = "gp3"
         volume_size           = 128
         delete_on_termination = true
       }]
diff --git a/modules/k8s_eks_addons/gpu-operator.tf b/modules/k8s_eks_addons/gpu-operator.tf
index ceb8b2f..a0a6f48 100644
--- a/modules/k8s_eks_addons/gpu-operator.tf
+++ b/modules/k8s_eks_addons/gpu-operator.tf
@@ -1,7 +1,7 @@
 resource "helm_release" "gpu_operator" {
   count = var.gpu_operator_config.enable ? 1 : 0
 
-  namespace         = "gpu-operator"
+  namespace         = "kube-system"
   name              = "gpu-operator"
   chart             = "gpu-operator"
   create_namespace  = true

From 34ef8d554b97be6f3e9e8914710f7cd9d5384545 Mon Sep 17 00:00:00 2001
From: Christian Bergen <cbergen@dspace.de>
Date: Fri, 6 Dec 2024 13:32:01 +0100
Subject: [PATCH 26/29] add default (enable = false) for gpu_operator_config

---
 variables.tf | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/variables.tf b/variables.tf
index 0197e68..7120f61 100644
--- a/variables.tf
+++ b/variables.tf
@@ -413,4 +413,7 @@ YAML
     )
   })
   description = "Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of the GPU operator chart."
+  default = {
+    enable = false
+  }
 }

From 40c3ad93a1d8f6429e80668d8d2e359d3dc6612a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Fri, 6 Dec 2024 12:33:02 +0000
Subject: [PATCH 27/29] terraform-docs: automated action

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7518889..ea7cf5c 100644
--- a/README.md
+++ b/README.md
@@ -554,7 +554,7 @@ Encryption is enabled at all AWS resources that are created by Terraform:
 | <a name="input_gpuNodeDiskSize"></a> [gpuNodeDiskSize](#input\_gpuNodeDiskSize) | The disk size in GiB of the nodes for the gpu job execution | `number` | `100` | no |
 | <a name="input_gpuNodePool"></a> [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no |
 | <a name="input_gpuNodeSize"></a> [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `list(string)` | <pre>[<br>  "g5.2xlarge"<br>]</pre> | no |
-| <a name="input_gpu_operator_config"></a> [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. | <pre>object({<br>    enable          = optional(bool, true)<br>    helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")<br>    helm_version    = optional(string, "v24.9.0")<br>    driver_version  = optional(string, "550.90.07")<br>    chart_values = optional(string, <<-YAML<br>operator:<br>  defaultRuntime: containerd<br><br>dcgmExporter:<br>  enabled: false<br><br>driver:<br>  enabled: true<br><br>validator:<br>  driver:<br>    env:<br>    - name: DISABLE_DEV_CHAR_SYMLINK_CREATION<br>      value: "true"<br><br>toolkit:<br>  enabled: true<br><br>daemonsets:<br>  tolerations:<br>  - key: purpose<br>    value: gpu<br>    operator: Equal<br>    effect: NoSchedule<br><br>node-feature-discovery:<br>  worker:<br>    tolerations:<br>    - key: purpose<br>      value: gpu<br>      operator: Equal<br>      effect: NoSchedule<br>YAML<br>    )<br>  })</pre> | n/a | yes |
+| <a name="input_gpu_operator_config"></a> [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. | <pre>object({<br>    enable          = optional(bool, true)<br>    helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")<br>    helm_version    = optional(string, "v24.9.0")<br>    driver_version  = optional(string, "550.90.07")<br>    chart_values = optional(string, <<-YAML<br>operator:<br>  defaultRuntime: containerd<br><br>dcgmExporter:<br>  enabled: false<br><br>driver:<br>  enabled: true<br><br>validator:<br>  driver:<br>    env:<br>    - name: DISABLE_DEV_CHAR_SYMLINK_CREATION<br>      value: "true"<br><br>toolkit:<br>  enabled: true<br><br>daemonsets:<br>  tolerations:<br>  - key: purpose<br>    value: gpu<br>    operator: Equal<br>    effect: NoSchedule<br><br>node-feature-discovery:<br>  worker:<br>    tolerations:<br>    - key: purpose<br>      value: gpu<br>      operator: Equal<br>      effect: NoSchedule<br>YAML<br>    )<br>  })</pre> | <pre>{<br>  "enable": false<br>}</pre> | no |
 | <a name="input_infrastructurename"></a> [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | `"simphera"` | no |
 | <a name="input_ingress_nginx_config"></a> [ingress\_nginx\_config](#input\_ingress\_nginx\_config) | Input configuration for ingress-nginx service deployed with helm release. By setting key 'enable' to 'true', ingress-nginx service will be deployed. 'helm\_repository' is an URL for the repository of ingress-nginx helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an ingress-nginx chart. | <pre>object({<br>    enable          = bool<br>    helm_repository = optional(string, "https://kubernetes.github.io/ingress-nginx")<br>    helm_version    = optional(string, "4.1.4")<br>    chart_values = optional(string, <<-YAML<br>controller:<br>  images:<br>    registry: "registry.k8s.io"<br>  service:<br>    annotations:<br>      service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing<br>YAML<br>    )<br>  })</pre> | <pre>{<br>  "enable": false<br>}</pre> | no |
 | <a name="input_install_schedule"></a> [install\_schedule](#input\_install\_schedule) | 6-field Cron expression describing the install maintenance schedule. Must not overlap with variable scan\_schedule. | `string` | `"cron(0 3 * * ? *)"` | no |

From 46c4d80608166093c04df8d21f11e2af1a901ea1 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Fri, 6 Dec 2024 12:33:04 +0000
Subject: [PATCH 28/29] terraform-docs: automated action

---
 terraform.tfvars.example | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/terraform.tfvars.example b/terraform.tfvars.example
index 6f8f4d1..87a57f5 100644
--- a/terraform.tfvars.example
+++ b/terraform.tfvars.example
@@ -65,7 +65,9 @@ gpuNodeSize = [
 # By setting key 'enable' to 'true', GPU operator will be deployed.
 # 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart.
 # 'chart_values' is used for changing default values.yaml of the GPU operator chart.
-gpu_operator_config = 
+gpu_operator_config = {
+  "enable": false
+}
 
 # The name of the infrastructure. e.g. simphera-infra
 infrastructurename = "simphera"

From 923b94844c3e0b8411e35e2ecf9260df916a585b Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <github-actions[bot]@users.noreply.github.com>
Date: Fri, 6 Dec 2024 12:33:05 +0000
Subject: [PATCH 29/29] terraform-docs: automated action

---
 terraform.json.example | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/terraform.json.example b/terraform.json.example
index ba19074..e7b13bd 100644
--- a/terraform.json.example
+++ b/terraform.json.example
@@ -21,7 +21,9 @@
   "gpuNodeSize": [
     "g5.2xlarge"
   ],
-  "gpu_operator_config": null,
+  "gpu_operator_config": {
+    "enable": false
+  },
   "infrastructurename": "simphera",
   "ingress_nginx_config": {
     "enable": false