From 7948faa51dc417a23eb1989dab6bca3ae5a318fc Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Thu, 28 Nov 2024 09:55:00 +0100 Subject: [PATCH 01/29] add another ami for gpu usage which supports using GPU operator --- locals.tf | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/locals.tf b/locals.tf index f58774d..3929415 100644 --- a/locals.tf +++ b/locals.tf @@ -3,7 +3,7 @@ data "aws_ami" "al2gpu_ami" { most_recent = true filter { name = "name" - values = ["*amazon-eks-gpu-node-${var.kubernetesVersion}*"] + values = ["*ubuntu-eks/k8s_${var.kubernetesVersion}/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server*"] } } @@ -30,7 +30,6 @@ locals { private_subnets = local.create_vpc ? module.vpc[0].private_subnets : (local.use_private_subnets_ids ? var.private_subnet_ids : [for s in data.aws_subnet.private_subnet : s.id]) public_subnets = local.create_vpc ? module.vpc[0].public_subnets : (local.use_public_subnet_ids ? var.public_subnet_ids : [for s in data.aws_subnet.public_subnet : s.id]) # Using a one-line command for gpuPostUserData to avoid issues due to different line endings between Windows and Linux. - gpuPostUserData = "sudo yum -y erase nvidia-driver \nsudo yum -y install make gcc \nsudo yum -y update \nsudo yum -y install gcc kernel-devel-$(uname -r) \nsudo curl -fSsl -O https://us.download.nvidia.com/tesla/${var.gpuNvidiaDriverVersion}/NVIDIA-Linux-x86_64-${var.gpuNvidiaDriverVersion}.run \nsudo chmod +x NVIDIA-Linux-x86_64*.run \nsudo CC=/usr/bin/gcc10-cc ./NVIDIA-Linux-x86_64*.run -s --no-dkms --install-libglvnd \nsudo touch /etc/modprobe.d/nvidia.conf \necho \"options nvidia NVreg_EnableGpuFirmware=0\" | sudo tee --append /etc/modprobe.d/nvidia.conf \nsudo reboot" default_managed_node_pools = { "default" = { @@ -74,7 +73,12 @@ locals { disk_size = var.gpuNodeDiskSize custom_ami_id = data.aws_ami.al2gpu_ami.image_id create_launch_template = true - post_userdata = local.gpuPostUserData + block_device_mappings = [{ + device_name = "/dev/sda1" + volume_type = "gp2" + volume_size = 128 + delete_on_termination = true + }] k8s_labels = { "purpose" = "gpu" } @@ -99,7 +103,12 @@ locals { disk_size = var.ivsGpuNodeDiskSize custom_ami_id = data.aws_ami.al2gpu_ami.image_id create_launch_template = true - post_userdata = local.gpuPostUserData + block_device_mappings = [{ + device_name = "/dev/sda1" + volume_type = "gp2" + volume_size = 128 + delete_on_termination = true + }] k8s_labels = { "product" = "ivs", "purpose" = "gpu" From dfb0fbb087b38a4eb16a99f3f703408437fe21fa Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Thu, 28 Nov 2024 13:33:48 +0100 Subject: [PATCH 02/29] remove variable gpuNvidiaDriverVersion --- variables.tf | 6 ------ 1 file changed, 6 deletions(-) diff --git a/variables.tf b/variables.tf index 7149dc8..05e191b 100644 --- a/variables.tf +++ b/variables.tf @@ -118,12 +118,6 @@ variable "ivsGpuNodeDiskSize" { default = 100 } -variable "gpuNvidiaDriverVersion" { - type = string - description = "The NVIDIA driver version for GPU node group." - default = "535.54.03" -} - variable "licenseServer" { type = bool description = "Specifies whether a license server VM will be created." From 82b08618bf37a66656e025c1d9b5898d969ff2f8 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Mon, 2 Dec 2024 15:37:38 +0100 Subject: [PATCH 03/29] add gpu operator as eks-addon --- k8s-eks-addons.tf | 1 + modules/k8s_eks_addons/gpu-operator.tf | 23 ++++++++++++ modules/k8s_eks_addons/variables.tf | 10 ++++++ variables.tf | 48 ++++++++++++++++++++++++++ 4 files changed, 82 insertions(+) create mode 100644 modules/k8s_eks_addons/gpu-operator.tf diff --git a/k8s-eks-addons.tf b/k8s-eks-addons.tf index 0f77f4c..3c06ff0 100644 --- a/k8s-eks-addons.tf +++ b/k8s-eks-addons.tf @@ -6,6 +6,7 @@ module "k8s_eks_addons" { coredns_config = var.coredns_config s3_csi_config = var.s3_csi_config aws_load_balancer_controller_config = var.aws_load_balancer_controller_config + gpu_operator_config = var.gpu_operator_config addon_context = { aws_caller_identity_account_id = data.aws_caller_identity.current.account_id diff --git a/modules/k8s_eks_addons/gpu-operator.tf b/modules/k8s_eks_addons/gpu-operator.tf new file mode 100644 index 0000000..79ad623 --- /dev/null +++ b/modules/k8s_eks_addons/gpu-operator.tf @@ -0,0 +1,23 @@ +resource "kubernetes_namespace_v1" "gpu_operator" { + count = var.gpu_operator_config.enable ? 1 : 0 + + metadata { + name = "gpu-operator" + } +} + +resource "helm_release" "gpu_operator" { + count = var.gpu_operator_config.enable ? 1 : 0 + + namespace = kubernetes_namespace_v1.gpu_operator[0].metadata[0].name + name = "gpu-operator" + chart = "gpu-operator" + repository = var.gpu_operator_config.helm_repository + version = var.gpu_operator_config.helm_version + description = "The GPU operator HelmChart deployment configuration" + dependency_update = true + values = [ + var.gpu_operator_config.chart_values + ] + timeout = 1200 +} diff --git a/modules/k8s_eks_addons/variables.tf b/modules/k8s_eks_addons/variables.tf index 39ced2a..21498d5 100644 --- a/modules/k8s_eks_addons/variables.tf +++ b/modules/k8s_eks_addons/variables.tf @@ -49,3 +49,13 @@ variable "s3_csi_config" { }) description = "Input configuration for AWS EKS add-on aws-mountpoint-s3-csi-driver." } + +variable "gpu_operator_config" { + description = "GPU operator configuration" + type = object({ + enable = bool + helm_repository = string + helm_version = string + chart_values = string + }) +} diff --git a/variables.tf b/variables.tf index 05e191b..7acfa10 100644 --- a/variables.tf +++ b/variables.tf @@ -369,3 +369,51 @@ variable "aws_load_balancer_controller_config" { enable = false } } + +variable "gpu_operator_config" { + type = object({ + enable = optional(bool, true) + helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia") + helm_version = optional(string, "v24.9.0") + chart_values = optional(string, <<-YAML +operator: + defaultRuntime: containerd + +dcgmExporter: + enabled: false + +driver: + enabled: true + version: "550.90.07" + +validator: + driver: + env: + - name: DISABLE_DEV_CHAR_SYMLINK_CREATION + value: "true" + +toolkit: + enabled: true + +daemonsets: + tolerations: + - key: purpose + value: gpu + operator: Equal + effect: NoSchedule + +node-feature-discovery: + worker: + tolerations: + - key: purpose + value: gpu + operator: Equal + effect: NoSchedule +YAML + ) + }) + description = "Input configuration for GPU operator deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of an GPU operator chart." + default = { + enable = false + } +} From c6db00526bb7f7a34d6e384983b5feaaa922a942 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Mon, 2 Dec 2024 15:55:35 +0100 Subject: [PATCH 04/29] replace namespace creation resource --- modules/k8s_eks_addons/gpu-operator.tf | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/modules/k8s_eks_addons/gpu-operator.tf b/modules/k8s_eks_addons/gpu-operator.tf index 79ad623..aa6a920 100644 --- a/modules/k8s_eks_addons/gpu-operator.tf +++ b/modules/k8s_eks_addons/gpu-operator.tf @@ -1,17 +1,10 @@ -resource "kubernetes_namespace_v1" "gpu_operator" { - count = var.gpu_operator_config.enable ? 1 : 0 - - metadata { - name = "gpu-operator" - } -} - resource "helm_release" "gpu_operator" { count = var.gpu_operator_config.enable ? 1 : 0 namespace = kubernetes_namespace_v1.gpu_operator[0].metadata[0].name name = "gpu-operator" chart = "gpu-operator" + create_namespace = true repository = var.gpu_operator_config.helm_repository version = var.gpu_operator_config.helm_version description = "The GPU operator HelmChart deployment configuration" From 61b20159f4058e8cac42206af54c751bb05626cd Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Mon, 2 Dec 2024 15:58:12 +0100 Subject: [PATCH 05/29] hardcode namespace --- modules/k8s_eks_addons/gpu-operator.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/k8s_eks_addons/gpu-operator.tf b/modules/k8s_eks_addons/gpu-operator.tf index aa6a920..6578ac8 100644 --- a/modules/k8s_eks_addons/gpu-operator.tf +++ b/modules/k8s_eks_addons/gpu-operator.tf @@ -1,7 +1,7 @@ resource "helm_release" "gpu_operator" { count = var.gpu_operator_config.enable ? 1 : 0 - namespace = kubernetes_namespace_v1.gpu_operator[0].metadata[0].name + namespace = "gpu-operator" name = "gpu-operator" chart = "gpu-operator" create_namespace = true From 57f179d8a7cf7f1cf1dbadd516b976d1ea91ed0a Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Tue, 3 Dec 2024 08:30:16 +0100 Subject: [PATCH 06/29] add driver version and remove default chart values --- modules/k8s_eks_addons/gpu-operator.tf | 5 ++++ variables.tf | 36 +++----------------------- 2 files changed, 8 insertions(+), 33 deletions(-) diff --git a/modules/k8s_eks_addons/gpu-operator.tf b/modules/k8s_eks_addons/gpu-operator.tf index 6578ac8..ceb8b2f 100644 --- a/modules/k8s_eks_addons/gpu-operator.tf +++ b/modules/k8s_eks_addons/gpu-operator.tf @@ -13,4 +13,9 @@ resource "helm_release" "gpu_operator" { var.gpu_operator_config.chart_values ] timeout = 1200 + set { + name = "driver.version" + value = var.gpu_operator_config.driver_version + } + } diff --git a/variables.tf b/variables.tf index 7acfa10..40e79d6 100644 --- a/variables.tf +++ b/variables.tf @@ -375,40 +375,9 @@ variable "gpu_operator_config" { enable = optional(bool, true) helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia") helm_version = optional(string, "v24.9.0") + driver_version = string chart_values = optional(string, <<-YAML -operator: - defaultRuntime: containerd - -dcgmExporter: - enabled: false - -driver: - enabled: true - version: "550.90.07" - -validator: - driver: - env: - - name: DISABLE_DEV_CHAR_SYMLINK_CREATION - value: "true" - -toolkit: - enabled: true - -daemonsets: - tolerations: - - key: purpose - value: gpu - operator: Equal - effect: NoSchedule - -node-feature-discovery: - worker: - tolerations: - - key: purpose - value: gpu - operator: Equal - effect: NoSchedule + YAML ) }) @@ -417,3 +386,4 @@ YAML enable = false } } + From e775b2b741c081091c949623f9e95701ff26ace8 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Tue, 3 Dec 2024 08:46:27 +0100 Subject: [PATCH 07/29] remove default --- variables.tf | 3 --- 1 file changed, 3 deletions(-) diff --git a/variables.tf b/variables.tf index 40e79d6..865cf89 100644 --- a/variables.tf +++ b/variables.tf @@ -382,8 +382,5 @@ YAML ) }) description = "Input configuration for GPU operator deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of an GPU operator chart." - default = { - enable = false - } } From 901af33b79f06c02404cbc2008ae19b1ed2e7572 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Tue, 3 Dec 2024 08:57:29 +0100 Subject: [PATCH 08/29] add driver_version to variables.tf of the eks-addons --- modules/k8s_eks_addons/variables.tf | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/k8s_eks_addons/variables.tf b/modules/k8s_eks_addons/variables.tf index 21498d5..81003e2 100644 --- a/modules/k8s_eks_addons/variables.tf +++ b/modules/k8s_eks_addons/variables.tf @@ -57,5 +57,6 @@ variable "gpu_operator_config" { helm_repository = string helm_version = string chart_values = string + driver_version = string }) } From 1c13fb48a632ef2fecf51c3bdcf5dce9faf43b57 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 3 Dec 2024 14:51:13 +0000 Subject: [PATCH 09/29] terraform-docs: automated action --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 625ac46..ccdbda3 100644 --- a/README.md +++ b/README.md @@ -554,7 +554,7 @@ Encryption is enabled at all AWS resources that are created by Terraform: | [gpuNodeDiskSize](#input\_gpuNodeDiskSize) | The disk size in GiB of the nodes for the gpu job execution | `number` | `100` | no | | [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no | | [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `list(string)` |
[
"g5.2xlarge"
]
| no | -| [gpuNvidiaDriverVersion](#input\_gpuNvidiaDriverVersion) | The NVIDIA driver version for GPU node group. | `string` | `"535.54.03"` | no | +| [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for GPU operator deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an GPU operator chart. |
object({
enable = optional(bool, true)
helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
helm_version = optional(string, "v24.9.0")
driver_version = string
chart_values = optional(string, <<-YAML

YAML
)
})
| n/a | yes | | [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | `"simphera"` | no | | [ingress\_nginx\_config](#input\_ingress\_nginx\_config) | Input configuration for ingress-nginx service deployed with helm release. By setting key 'enable' to 'true', ingress-nginx service will be deployed. 'helm\_repository' is an URL for the repository of ingress-nginx helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an ingress-nginx chart. |
object({
enable = bool
helm_repository = optional(string, "https://kubernetes.github.io/ingress-nginx")
helm_version = optional(string, "4.1.4")
chart_values = optional(string, <<-YAML
controller:
images:
registry: "registry.k8s.io"
service:
annotations:
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
YAML
)
})
|
{
"enable": false
}
| no | | [install\_schedule](#input\_install\_schedule) | 6-field Cron expression describing the install maintenance schedule. Must not overlap with variable scan\_schedule. | `string` | `"cron(0 3 * * ? *)"` | no | From a1419824cd55e65276f480a8ddcc5ae622f92d2a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 3 Dec 2024 14:51:15 +0000 Subject: [PATCH 10/29] terraform-docs: automated action --- terraform.tfvars.example | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/terraform.tfvars.example b/terraform.tfvars.example index 865b793..b3d3909 100644 --- a/terraform.tfvars.example +++ b/terraform.tfvars.example @@ -61,8 +61,11 @@ gpuNodeSize = [ "g5.2xlarge" ] -# The NVIDIA driver version for GPU node group. -gpuNvidiaDriverVersion = "535.54.03" +# Input configuration for GPU operator deployed with helm release. +# By setting key 'enable' to 'true', GPU operator will be deployed. +# 'helm_repository' is an URL for the repository of GPU operator helm chart, where 'helm_version' is its respective version of a chart. +# 'chart_values' is used for changing default values.yaml of an GPU operator chart. +gpu_operator_config = # The name of the infrastructure. e.g. simphera-infra infrastructurename = "simphera" From b8ac7e3ac676db3286ecbc3ec6fd49acdfa9c8be Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 3 Dec 2024 14:51:16 +0000 Subject: [PATCH 11/29] terraform-docs: automated action --- terraform.json.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform.json.example b/terraform.json.example index 1c48f79..ba19074 100644 --- a/terraform.json.example +++ b/terraform.json.example @@ -21,7 +21,7 @@ "gpuNodeSize": [ "g5.2xlarge" ], - "gpuNvidiaDriverVersion": "535.54.03", + "gpu_operator_config": null, "infrastructurename": "simphera", "ingress_nginx_config": { "enable": false From 74a723d738e767119f57e119676fa413cb317233 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Tue, 3 Dec 2024 15:57:52 +0100 Subject: [PATCH 12/29] add to description --- variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/variables.tf b/variables.tf index 865cf89..87669ce 100644 --- a/variables.tf +++ b/variables.tf @@ -381,6 +381,6 @@ variable "gpu_operator_config" { YAML ) }) - description = "Input configuration for GPU operator deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of an GPU operator chart." + description = "Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of the GPU operator chart." } From a7b235011358fa37f7af387042b81ecd1ca3d3ad Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Tue, 3 Dec 2024 15:58:33 +0100 Subject: [PATCH 13/29] remove empty line --- variables.tf | 1 - 1 file changed, 1 deletion(-) diff --git a/variables.tf b/variables.tf index 87669ce..c33e8b8 100644 --- a/variables.tf +++ b/variables.tf @@ -383,4 +383,3 @@ YAML }) description = "Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of the GPU operator chart." } - From 15ca2bbe5575a48fee8952c74169365b6c02d122 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 3 Dec 2024 14:59:37 +0000 Subject: [PATCH 14/29] terraform-docs: automated action --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ccdbda3..903cb55 100644 --- a/README.md +++ b/README.md @@ -554,7 +554,7 @@ Encryption is enabled at all AWS resources that are created by Terraform: | [gpuNodeDiskSize](#input\_gpuNodeDiskSize) | The disk size in GiB of the nodes for the gpu job execution | `number` | `100` | no | | [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no | | [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `list(string)` |
[
"g5.2xlarge"
]
| no | -| [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for GPU operator deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an GPU operator chart. |
object({
enable = optional(bool, true)
helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
helm_version = optional(string, "v24.9.0")
driver_version = string
chart_values = optional(string, <<-YAML

YAML
)
})
| n/a | yes | +| [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. |
object({
enable = optional(bool, true)
helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
helm_version = optional(string, "v24.9.0")
driver_version = string
chart_values = optional(string, <<-YAML

YAML
)
})
| n/a | yes | | [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | `"simphera"` | no | | [ingress\_nginx\_config](#input\_ingress\_nginx\_config) | Input configuration for ingress-nginx service deployed with helm release. By setting key 'enable' to 'true', ingress-nginx service will be deployed. 'helm\_repository' is an URL for the repository of ingress-nginx helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an ingress-nginx chart. |
object({
enable = bool
helm_repository = optional(string, "https://kubernetes.github.io/ingress-nginx")
helm_version = optional(string, "4.1.4")
chart_values = optional(string, <<-YAML
controller:
images:
registry: "registry.k8s.io"
service:
annotations:
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
YAML
)
})
|
{
"enable": false
}
| no | | [install\_schedule](#input\_install\_schedule) | 6-field Cron expression describing the install maintenance schedule. Must not overlap with variable scan\_schedule. | `string` | `"cron(0 3 * * ? *)"` | no | From ab5ae14817d828c692b5f64753b1455e56549270 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 3 Dec 2024 14:59:39 +0000 Subject: [PATCH 15/29] terraform-docs: automated action --- terraform.tfvars.example | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/terraform.tfvars.example b/terraform.tfvars.example index b3d3909..6f8f4d1 100644 --- a/terraform.tfvars.example +++ b/terraform.tfvars.example @@ -61,10 +61,10 @@ gpuNodeSize = [ "g5.2xlarge" ] -# Input configuration for GPU operator deployed with helm release. +# Input configuration for the GPU operator chart deployed with helm release. # By setting key 'enable' to 'true', GPU operator will be deployed. -# 'helm_repository' is an URL for the repository of GPU operator helm chart, where 'helm_version' is its respective version of a chart. -# 'chart_values' is used for changing default values.yaml of an GPU operator chart. +# 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. +# 'chart_values' is used for changing default values.yaml of the GPU operator chart. gpu_operator_config = # The name of the infrastructure. e.g. simphera-infra From 2c5274a949b6348794153b20c7bcbaea40cbc0c3 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Tue, 3 Dec 2024 16:46:55 +0100 Subject: [PATCH 16/29] add default gpu operator helm values --- variables.tf | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/variables.tf b/variables.tf index c33e8b8..0197e68 100644 --- a/variables.tf +++ b/variables.tf @@ -375,9 +375,40 @@ variable "gpu_operator_config" { enable = optional(bool, true) helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia") helm_version = optional(string, "v24.9.0") - driver_version = string + driver_version = optional(string, "550.90.07") chart_values = optional(string, <<-YAML - +operator: + defaultRuntime: containerd + +dcgmExporter: + enabled: false + +driver: + enabled: true + +validator: + driver: + env: + - name: DISABLE_DEV_CHAR_SYMLINK_CREATION + value: "true" + +toolkit: + enabled: true + +daemonsets: + tolerations: + - key: purpose + value: gpu + operator: Equal + effect: NoSchedule + +node-feature-discovery: + worker: + tolerations: + - key: purpose + value: gpu + operator: Equal + effect: NoSchedule YAML ) }) From f56b3c145fcbe4cd2293e2cf795808081bb72e81 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 3 Dec 2024 15:47:59 +0000 Subject: [PATCH 17/29] terraform-docs: automated action --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 903cb55..7518889 100644 --- a/README.md +++ b/README.md @@ -554,7 +554,7 @@ Encryption is enabled at all AWS resources that are created by Terraform: | [gpuNodeDiskSize](#input\_gpuNodeDiskSize) | The disk size in GiB of the nodes for the gpu job execution | `number` | `100` | no | | [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no | | [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `list(string)` |
[
"g5.2xlarge"
]
| no | -| [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. |
object({
enable = optional(bool, true)
helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
helm_version = optional(string, "v24.9.0")
driver_version = string
chart_values = optional(string, <<-YAML

YAML
)
})
| n/a | yes | +| [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. |
object({
enable = optional(bool, true)
helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
helm_version = optional(string, "v24.9.0")
driver_version = optional(string, "550.90.07")
chart_values = optional(string, <<-YAML
operator:
defaultRuntime: containerd

dcgmExporter:
enabled: false

driver:
enabled: true

validator:
driver:
env:
- name: DISABLE_DEV_CHAR_SYMLINK_CREATION
value: "true"

toolkit:
enabled: true

daemonsets:
tolerations:
- key: purpose
value: gpu
operator: Equal
effect: NoSchedule

node-feature-discovery:
worker:
tolerations:
- key: purpose
value: gpu
operator: Equal
effect: NoSchedule
YAML
)
})
| n/a | yes | | [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | `"simphera"` | no | | [ingress\_nginx\_config](#input\_ingress\_nginx\_config) | Input configuration for ingress-nginx service deployed with helm release. By setting key 'enable' to 'true', ingress-nginx service will be deployed. 'helm\_repository' is an URL for the repository of ingress-nginx helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an ingress-nginx chart. |
object({
enable = bool
helm_repository = optional(string, "https://kubernetes.github.io/ingress-nginx")
helm_version = optional(string, "4.1.4")
chart_values = optional(string, <<-YAML
controller:
images:
registry: "registry.k8s.io"
service:
annotations:
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
YAML
)
})
|
{
"enable": false
}
| no | | [install\_schedule](#input\_install\_schedule) | 6-field Cron expression describing the install maintenance schedule. Must not overlap with variable scan\_schedule. | `string` | `"cron(0 3 * * ? *)"` | no | From af1916c0cef3fde1dd61902b5fd0a78ad6eda579 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Thu, 5 Dec 2024 11:48:28 +0100 Subject: [PATCH 18/29] remove launch template creation --- locals.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/locals.tf b/locals.tf index 3929415..773e656 100644 --- a/locals.tf +++ b/locals.tf @@ -72,7 +72,7 @@ locals { min_size = var.gpuNodeCountMin disk_size = var.gpuNodeDiskSize custom_ami_id = data.aws_ami.al2gpu_ami.image_id - create_launch_template = true + create_launch_template = false block_device_mappings = [{ device_name = "/dev/sda1" volume_type = "gp2" From 77525890e8521bc97f162ae929a92f22e4b25e73 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Thu, 5 Dec 2024 13:06:24 +0100 Subject: [PATCH 19/29] enable launch template --- locals.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/locals.tf b/locals.tf index 773e656..3929415 100644 --- a/locals.tf +++ b/locals.tf @@ -72,7 +72,7 @@ locals { min_size = var.gpuNodeCountMin disk_size = var.gpuNodeDiskSize custom_ami_id = data.aws_ami.al2gpu_ami.image_id - create_launch_template = false + create_launch_template = true block_device_mappings = [{ device_name = "/dev/sda1" volume_type = "gp2" From 498e43d9bd500f969b8cc076fa5b97776c884ea8 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Thu, 5 Dec 2024 14:00:45 +0100 Subject: [PATCH 20/29] try another way --- locals.tf | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/locals.tf b/locals.tf index 3929415..df360e5 100644 --- a/locals.tf +++ b/locals.tf @@ -59,10 +59,7 @@ locals { "effect" = "NO_SCHEDULE" } ] - } - } - - gpu_node_pool = { + }, "gpuexecnodes" = { node_group_name = "gpuexecnodes" instance_types = var.gpuNodeSize @@ -92,6 +89,10 @@ locals { } } + # gpu_node_pool = { + + # } + ivsgpu_node_pool = { "gpuivsnodes" = { node_group_name = "gpuivsnodes" From e4adfb94c390743f433d2d48750d6611029ae91b Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Thu, 5 Dec 2024 14:15:51 +0100 Subject: [PATCH 21/29] test with false template --- locals.tf | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/locals.tf b/locals.tf index df360e5..773e656 100644 --- a/locals.tf +++ b/locals.tf @@ -59,7 +59,10 @@ locals { "effect" = "NO_SCHEDULE" } ] - }, + } + } + + gpu_node_pool = { "gpuexecnodes" = { node_group_name = "gpuexecnodes" instance_types = var.gpuNodeSize @@ -69,7 +72,7 @@ locals { min_size = var.gpuNodeCountMin disk_size = var.gpuNodeDiskSize custom_ami_id = data.aws_ami.al2gpu_ami.image_id - create_launch_template = true + create_launch_template = false block_device_mappings = [{ device_name = "/dev/sda1" volume_type = "gp2" @@ -89,10 +92,6 @@ locals { } } - # gpu_node_pool = { - - # } - ivsgpu_node_pool = { "gpuivsnodes" = { node_group_name = "gpuivsnodes" From 8471a52b0edeee73e53e1e76fbf232f4859e642e Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Thu, 5 Dec 2024 14:17:19 +0100 Subject: [PATCH 22/29] reset to true --- locals.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/locals.tf b/locals.tf index 773e656..3929415 100644 --- a/locals.tf +++ b/locals.tf @@ -72,7 +72,7 @@ locals { min_size = var.gpuNodeCountMin disk_size = var.gpuNodeDiskSize custom_ami_id = data.aws_ami.al2gpu_ami.image_id - create_launch_template = false + create_launch_template = true block_device_mappings = [{ device_name = "/dev/sda1" volume_type = "gp2" From 1b06f076a410e92b4b8b58ea182e6b22dad09bd5 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Fri, 6 Dec 2024 10:40:29 +0100 Subject: [PATCH 23/29] fix description --- terraform.tfvars.example | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/terraform.tfvars.example b/terraform.tfvars.example index 6f8f4d1..7631570 100644 --- a/terraform.tfvars.example +++ b/terraform.tfvars.example @@ -65,7 +65,17 @@ gpuNodeSize = [ # By setting key 'enable' to 'true', GPU operator will be deployed. # 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. # 'chart_values' is used for changing default values.yaml of the GPU operator chart. -gpu_operator_config = +gpu_operator_config = { + type = object({ + enable = optional(bool, true) + helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia") + helm_version = optional(string, "v24.9.0") + driver_version = optional(string, "550.90.07") + chart_values = optional(string, <<-YAML +YAML + ) + }) +} # The name of the infrastructure. e.g. simphera-infra infrastructurename = "simphera" From 7fad415e507cb9425a00d100cc57c106a7f7236f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 6 Dec 2024 09:41:39 +0000 Subject: [PATCH 24/29] terraform-docs: automated action --- terraform.tfvars.example | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/terraform.tfvars.example b/terraform.tfvars.example index 7631570..6f8f4d1 100644 --- a/terraform.tfvars.example +++ b/terraform.tfvars.example @@ -65,17 +65,7 @@ gpuNodeSize = [ # By setting key 'enable' to 'true', GPU operator will be deployed. # 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. # 'chart_values' is used for changing default values.yaml of the GPU operator chart. -gpu_operator_config = { - type = object({ - enable = optional(bool, true) - helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia") - helm_version = optional(string, "v24.9.0") - driver_version = optional(string, "550.90.07") - chart_values = optional(string, <<-YAML -YAML - ) - }) -} +gpu_operator_config = # The name of the infrastructure. e.g. simphera-infra infrastructurename = "simphera" From b3eb332e65734d91883cdea49c004ebc3e6845d5 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Fri, 6 Dec 2024 10:42:56 +0100 Subject: [PATCH 25/29] try gp3 and change namespace --- locals.tf | 4 ++-- modules/k8s_eks_addons/gpu-operator.tf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/locals.tf b/locals.tf index 3929415..908780c 100644 --- a/locals.tf +++ b/locals.tf @@ -75,7 +75,7 @@ locals { create_launch_template = true block_device_mappings = [{ device_name = "/dev/sda1" - volume_type = "gp2" + volume_type = "gp3" volume_size = 128 delete_on_termination = true }] @@ -105,7 +105,7 @@ locals { create_launch_template = true block_device_mappings = [{ device_name = "/dev/sda1" - volume_type = "gp2" + volume_type = "gp3" volume_size = 128 delete_on_termination = true }] diff --git a/modules/k8s_eks_addons/gpu-operator.tf b/modules/k8s_eks_addons/gpu-operator.tf index ceb8b2f..a0a6f48 100644 --- a/modules/k8s_eks_addons/gpu-operator.tf +++ b/modules/k8s_eks_addons/gpu-operator.tf @@ -1,7 +1,7 @@ resource "helm_release" "gpu_operator" { count = var.gpu_operator_config.enable ? 1 : 0 - namespace = "gpu-operator" + namespace = "kube-system" name = "gpu-operator" chart = "gpu-operator" create_namespace = true From 34ef8d554b97be6f3e9e8914710f7cd9d5384545 Mon Sep 17 00:00:00 2001 From: Christian Bergen Date: Fri, 6 Dec 2024 13:32:01 +0100 Subject: [PATCH 26/29] add default --- variables.tf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/variables.tf b/variables.tf index 0197e68..7120f61 100644 --- a/variables.tf +++ b/variables.tf @@ -413,4 +413,7 @@ YAML ) }) description = "Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of the GPU operator chart." + default = { + enable = false + } } From 40c3ad93a1d8f6429e80668d8d2e359d3dc6612a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 6 Dec 2024 12:33:02 +0000 Subject: [PATCH 27/29] terraform-docs: automated action --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7518889..ea7cf5c 100644 --- a/README.md +++ b/README.md @@ -554,7 +554,7 @@ Encryption is enabled at all AWS resources that are created by Terraform: | [gpuNodeDiskSize](#input\_gpuNodeDiskSize) | The disk size in GiB of the nodes for the gpu job execution | `number` | `100` | no | | [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no | | [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `list(string)` |
[
"g5.2xlarge"
]
| no | -| [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. |
object({
enable = optional(bool, true)
helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
helm_version = optional(string, "v24.9.0")
driver_version = optional(string, "550.90.07")
chart_values = optional(string, <<-YAML
operator:
defaultRuntime: containerd

dcgmExporter:
enabled: false

driver:
enabled: true

validator:
driver:
env:
- name: DISABLE_DEV_CHAR_SYMLINK_CREATION
value: "true"

toolkit:
enabled: true

daemonsets:
tolerations:
- key: purpose
value: gpu
operator: Equal
effect: NoSchedule

node-feature-discovery:
worker:
tolerations:
- key: purpose
value: gpu
operator: Equal
effect: NoSchedule
YAML
)
})
| n/a | yes | +| [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. |
object({
enable = optional(bool, true)
helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
helm_version = optional(string, "v24.9.0")
driver_version = optional(string, "550.90.07")
chart_values = optional(string, <<-YAML
operator:
defaultRuntime: containerd

dcgmExporter:
enabled: false

driver:
enabled: true

validator:
driver:
env:
- name: DISABLE_DEV_CHAR_SYMLINK_CREATION
value: "true"

toolkit:
enabled: true

daemonsets:
tolerations:
- key: purpose
value: gpu
operator: Equal
effect: NoSchedule

node-feature-discovery:
worker:
tolerations:
- key: purpose
value: gpu
operator: Equal
effect: NoSchedule
YAML
)
})
|
{
"enable": false
}
| no | | [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | `"simphera"` | no | | [ingress\_nginx\_config](#input\_ingress\_nginx\_config) | Input configuration for ingress-nginx service deployed with helm release. By setting key 'enable' to 'true', ingress-nginx service will be deployed. 'helm\_repository' is an URL for the repository of ingress-nginx helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an ingress-nginx chart. |
object({
enable = bool
helm_repository = optional(string, "https://kubernetes.github.io/ingress-nginx")
helm_version = optional(string, "4.1.4")
chart_values = optional(string, <<-YAML
controller:
images:
registry: "registry.k8s.io"
service:
annotations:
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
YAML
)
})
|
{
"enable": false
}
| no | | [install\_schedule](#input\_install\_schedule) | 6-field Cron expression describing the install maintenance schedule. Must not overlap with variable scan\_schedule. | `string` | `"cron(0 3 * * ? *)"` | no | From 46c4d80608166093c04df8d21f11e2af1a901ea1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 6 Dec 2024 12:33:04 +0000 Subject: [PATCH 28/29] terraform-docs: automated action --- terraform.tfvars.example | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/terraform.tfvars.example b/terraform.tfvars.example index 6f8f4d1..87a57f5 100644 --- a/terraform.tfvars.example +++ b/terraform.tfvars.example @@ -65,7 +65,9 @@ gpuNodeSize = [ # By setting key 'enable' to 'true', GPU operator will be deployed. # 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. # 'chart_values' is used for changing default values.yaml of the GPU operator chart. -gpu_operator_config = +gpu_operator_config = { + "enable": false +} # The name of the infrastructure. e.g. simphera-infra infrastructurename = "simphera" From 923b94844c3e0b8411e35e2ecf9260df916a585b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 6 Dec 2024 12:33:05 +0000 Subject: [PATCH 29/29] terraform-docs: automated action --- terraform.json.example | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/terraform.json.example b/terraform.json.example index ba19074..e7b13bd 100644 --- a/terraform.json.example +++ b/terraform.json.example @@ -21,7 +21,9 @@ "gpuNodeSize": [ "g5.2xlarge" ], - "gpu_operator_config": null, + "gpu_operator_config": { + "enable": false + }, "infrastructurename": "simphera", "ingress_nginx_config": { "enable": false