Skip to content

Commit

Permalink
Merge branch 'main' into eks/gpu_changes_main
Browse files Browse the repository at this point in the history
  • Loading branch information
vradicevicds committed Dec 12, 2024
2 parents 80f874b + 5f7fbe4 commit 6fc8294
Show file tree
Hide file tree
Showing 8 changed files with 105 additions and 14 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ Encryption is enabled at all AWS resources that are created by Terraform:
| <a name="input_gpuNodeDiskSize"></a> [gpuNodeDiskSize](#input\_gpuNodeDiskSize) | The disk size in GiB of the nodes for the gpu job execution | `number` | `100` | no |
| <a name="input_gpuNodePool"></a> [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no |
| <a name="input_gpuNodeSize"></a> [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `list(string)` | <pre>[<br> "g5.2xlarge"<br>]</pre> | no |
| <a name="input_gpuNvidiaDriverVersion"></a> [gpuNvidiaDriverVersion](#input\_gpuNvidiaDriverVersion) | The NVIDIA driver version for GPU node group. | `string` | `"535.54.03"` | no |
| <a name="input_gpu_operator_config"></a> [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. | <pre>object({<br> enable = optional(bool, true)<br> helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")<br> helm_version = optional(string, "v24.9.0")<br> driver_version = optional(string, "550.90.07")<br> chart_values = optional(string, <<-YAML<br>operator:<br> defaultRuntime: containerd<br><br>dcgmExporter:<br> enabled: false<br><br>driver:<br> enabled: true<br><br>validator:<br> driver:<br> env:<br> - name: DISABLE_DEV_CHAR_SYMLINK_CREATION<br> value: "true"<br><br>toolkit:<br> enabled: true<br><br>daemonsets:<br> tolerations:<br> - key: purpose<br> value: gpu<br> operator: Equal<br> effect: NoSchedule<br><br>node-feature-discovery:<br> worker:<br> tolerations:<br> - key: purpose<br> value: gpu<br> operator: Equal<br> effect: NoSchedule<br>YAML<br> )<br> })</pre> | <pre>{<br> "enable": false<br>}</pre> | no |
| <a name="input_infrastructurename"></a> [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | `"simphera"` | no |
| <a name="input_ingress_nginx_config"></a> [ingress\_nginx\_config](#input\_ingress\_nginx\_config) | Input configuration for ingress-nginx service deployed with helm release. By setting key 'enable' to 'true', ingress-nginx service will be deployed. 'helm\_repository' is an URL for the repository of ingress-nginx helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an ingress-nginx chart. | <pre>object({<br> enable = bool<br> helm_repository = optional(string, "https://kubernetes.github.io/ingress-nginx")<br> helm_version = optional(string, "4.1.4")<br> chart_values = optional(string, <<-YAML<br>controller:<br> images:<br> registry: "registry.k8s.io"<br> service:<br> annotations:<br> service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing<br>YAML<br> )<br> })</pre> | <pre>{<br> "enable": false<br>}</pre> | no |
| <a name="input_install_schedule"></a> [install\_schedule](#input\_install\_schedule) | 6-field Cron expression describing the install maintenance schedule. Must not overlap with variable scan\_schedule. | `string` | `"cron(0 3 * * ? *)"` | no |
Expand Down
1 change: 1 addition & 0 deletions k8s-eks-addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# coredns_config = var.coredns_config
# s3_csi_config = var.s3_csi_config
# aws_load_balancer_controller_config = var.aws_load_balancer_controller_config
# gpu_operator_config = var.gpu_operator_config

# addon_context = {
# aws_caller_identity_account_id = data.aws_caller_identity.current.account_id
Expand Down
17 changes: 13 additions & 4 deletions locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ data "aws_ami" "al2gpu_ami" {
most_recent = true
filter {
name = "name"
values = ["*amazon-eks-gpu-node-${var.kubernetesVersion}*"]
values = ["*ubuntu-eks/k8s_${var.kubernetesVersion}/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server*"]
}
}

Expand All @@ -30,7 +30,6 @@ locals {
private_subnets = local.create_vpc ? module.vpc[0].private_subnets : (local.use_private_subnets_ids ? var.private_subnet_ids : [for s in data.aws_subnet.private_subnet : s.id])
public_subnets = local.create_vpc ? module.vpc[0].public_subnets : (local.use_public_subnet_ids ? var.public_subnet_ids : [for s in data.aws_subnet.public_subnet : s.id])
# Using a one-line command for gpuPostUserData to avoid issues due to different line endings between Windows and Linux.
gpuPostUserData = "sudo yum -y erase nvidia-driver \nsudo yum -y install make gcc \nsudo yum -y update \nsudo yum -y install gcc kernel-devel-$(uname -r) \nsudo curl -fSsl -O https://us.download.nvidia.com/tesla/${var.gpuNvidiaDriverVersion}/NVIDIA-Linux-x86_64-${var.gpuNvidiaDriverVersion}.run \nsudo chmod +x NVIDIA-Linux-x86_64*.run \nsudo CC=/usr/bin/gcc10-cc ./NVIDIA-Linux-x86_64*.run -s --no-dkms --install-libglvnd \nsudo touch /etc/modprobe.d/nvidia.conf \necho \"options nvidia NVreg_EnableGpuFirmware=0\" | sudo tee --append /etc/modprobe.d/nvidia.conf \nsudo reboot"

default_managed_node_pools = {
"default" = {
Expand Down Expand Up @@ -74,7 +73,12 @@ locals {
disk_size = var.gpuNodeDiskSize
custom_ami_id = data.aws_ami.al2gpu_ami.image_id
create_launch_template = true
post_userdata = local.gpuPostUserData
block_device_mappings = [{
device_name = "/dev/sda1"
volume_type = "gp3"
volume_size = 128
delete_on_termination = true
}]
k8s_labels = {
"purpose" = "gpu"
}
Expand All @@ -99,7 +103,12 @@ locals {
disk_size = var.ivsGpuNodeDiskSize
custom_ami_id = data.aws_ami.al2gpu_ami.image_id
create_launch_template = true
post_userdata = local.gpuPostUserData
block_device_mappings = [{
device_name = "/dev/sda1"
volume_type = "gp3"
volume_size = 128
delete_on_termination = true
}]
k8s_labels = {
"product" = "ivs",
"purpose" = "gpu"
Expand Down
21 changes: 21 additions & 0 deletions modules/k8s_eks_addons/gpu-operator.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
resource "helm_release" "gpu_operator" {
count = var.gpu_operator_config.enable ? 1 : 0

namespace = "kube-system"
name = "gpu-operator"
chart = "gpu-operator"
create_namespace = true
repository = var.gpu_operator_config.helm_repository
version = var.gpu_operator_config.helm_version
description = "The GPU operator HelmChart deployment configuration"
dependency_update = true
values = [
var.gpu_operator_config.chart_values
]
timeout = 1200
set {
name = "driver.version"
value = var.gpu_operator_config.driver_version
}

}
11 changes: 11 additions & 0 deletions modules/k8s_eks_addons/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,14 @@ variable "s3_csi_config" {
})
description = "Input configuration for AWS EKS add-on aws-mountpoint-s3-csi-driver."
}

variable "gpu_operator_config" {
description = "GPU operator configuration"
type = object({
enable = bool
helm_repository = string
helm_version = string
chart_values = string
driver_version = string
})
}
4 changes: 3 additions & 1 deletion terraform.json.example
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
"gpuNodeSize": [
"g5.2xlarge"
],
"gpuNvidiaDriverVersion": "535.54.03",
"gpu_operator_config": {
"enable": false
},
"infrastructurename": "simphera",
"ingress_nginx_config": {
"enable": false
Expand Down
9 changes: 7 additions & 2 deletions terraform.tfvars.example
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,13 @@ gpuNodeSize = [
"g5.2xlarge"
]

# The NVIDIA driver version for GPU node group.
gpuNvidiaDriverVersion = "535.54.03"
# Input configuration for the GPU operator chart deployed with a helm release.
# Setting key 'enable' to 'true' deploys the GPU operator.
# 'helm_repository' is the URL of the GPU operator helm chart repository, and 'helm_version' is the chart version to install.
# 'chart_values' overrides the default values.yaml of the GPU operator chart.
gpu_operator_config = {
"enable": false
}

# The name of the infrastructure. e.g. simphera-infra
infrastructurename = "simphera"
Expand Down
54 changes: 48 additions & 6 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,6 @@ variable "ivsGpuNodeDiskSize" {
default = 100
}

variable "gpuNvidiaDriverVersion" {
type = string
description = "The NVIDIA driver version for GPU node group."
default = "535.54.03"
}

variable "licenseServer" {
type = bool
description = "Specifies whether a license server VM will be created."
Expand Down Expand Up @@ -375,3 +369,51 @@ variable "aws_load_balancer_controller_config" {
enable = false
}
}

variable "gpu_operator_config" {
type = object({
enable = optional(bool, true)
helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
helm_version = optional(string, "v24.9.0")
driver_version = optional(string, "550.90.07")
chart_values = optional(string, <<-YAML
operator:
defaultRuntime: containerd
dcgmExporter:
enabled: false
driver:
enabled: true
validator:
driver:
env:
- name: DISABLE_DEV_CHAR_SYMLINK_CREATION
value: "true"
toolkit:
enabled: true
daemonsets:
tolerations:
- key: purpose
value: gpu
operator: Equal
effect: NoSchedule
node-feature-discovery:
worker:
tolerations:
- key: purpose
value: gpu
operator: Equal
effect: NoSchedule
YAML
)
})
description = "Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of the GPU operator chart."
default = {
enable = false
}
}

0 comments on commit 6fc8294

Please sign in to comment.