Skip to content

Commit

Permalink
Merge branch 'main' into eks/gpu_changes_main
Browse files Browse the repository at this point in the history
  • Loading branch information
vradicevicds committed Dec 12, 2024
2 parents 80f874b + 5f7fbe4 commit 6fc8294
Show file tree
Hide file tree
Showing 8 changed files with 105 additions and 14 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@ Encryption is enabled at all AWS resources that are created by Terraform:
| <a name="input_gpuNodeDiskSize"></a> [gpuNodeDiskSize](#input\_gpuNodeDiskSize) | The disk size in GiB of the nodes for the gpu job execution | `number` | `100` | no |
| <a name="input_gpuNodePool"></a> [gpuNodePool](#input\_gpuNodePool) | Specifies whether an additional node pool for gpu job execution is added to the kubernetes cluster | `bool` | `false` | no |
| <a name="input_gpuNodeSize"></a> [gpuNodeSize](#input\_gpuNodeSize) | The machine size of the nodes for the gpu job execution | `list(string)` | <pre>[<br> "g5.2xlarge"<br>]</pre> | no |
| <a name="input_gpuNvidiaDriverVersion"></a> [gpuNvidiaDriverVersion](#input\_gpuNvidiaDriverVersion) | The NVIDIA driver version for GPU node group. | `string` | `"535.54.03"` | no |
| <a name="input_gpu_operator_config"></a> [gpu\_operator\_config](#input\_gpu\_operator\_config) | Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm\_repository' is an URL for the repository of the GPU operator helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of the GPU operator chart. | <pre>object({<br> enable = optional(bool, true)<br> helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")<br> helm_version = optional(string, "v24.9.0")<br> driver_version = optional(string, "550.90.07")<br> chart_values = optional(string, <<-YAML<br>operator:<br> defaultRuntime: containerd<br><br>dcgmExporter:<br> enabled: false<br><br>driver:<br> enabled: true<br><br>validator:<br> driver:<br> env:<br> - name: DISABLE_DEV_CHAR_SYMLINK_CREATION<br> value: "true"<br><br>toolkit:<br> enabled: true<br><br>daemonsets:<br> tolerations:<br> - key: purpose<br> value: gpu<br> operator: Equal<br> effect: NoSchedule<br><br>node-feature-discovery:<br> worker:<br> tolerations:<br> - key: purpose<br> value: gpu<br> operator: Equal<br> effect: NoSchedule<br>YAML<br> )<br> })</pre> | <pre>{<br> "enable": false<br>}</pre> | no |
| <a name="input_infrastructurename"></a> [infrastructurename](#input\_infrastructurename) | The name of the infrastructure. e.g. simphera-infra | `string` | `"simphera"` | no |
| <a name="input_ingress_nginx_config"></a> [ingress\_nginx\_config](#input\_ingress\_nginx\_config) | Input configuration for ingress-nginx service deployed with helm release. By setting key 'enable' to 'true', ingress-nginx service will be deployed. 'helm\_repository' is an URL for the repository of ingress-nginx helm chart, where 'helm\_version' is its respective version of a chart. 'chart\_values' is used for changing default values.yaml of an ingress-nginx chart. | <pre>object({<br> enable = bool<br> helm_repository = optional(string, "https://kubernetes.github.io/ingress-nginx")<br> helm_version = optional(string, "4.1.4")<br> chart_values = optional(string, <<-YAML<br>controller:<br> images:<br> registry: "registry.k8s.io"<br> service:<br> annotations:<br> service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing<br>YAML<br> )<br> })</pre> | <pre>{<br> "enable": false<br>}</pre> | no |
| <a name="input_install_schedule"></a> [install\_schedule](#input\_install\_schedule) | 6-field Cron expression describing the install maintenance schedule. Must not overlap with variable scan\_schedule. | `string` | `"cron(0 3 * * ? *)"` | no |
Expand Down
1 change: 1 addition & 0 deletions k8s-eks-addons.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# coredns_config = var.coredns_config
# s3_csi_config = var.s3_csi_config
# aws_load_balancer_controller_config = var.aws_load_balancer_controller_config
# gpu_operator_config = var.gpu_operator_config

# addon_context = {
# aws_caller_identity_account_id = data.aws_caller_identity.current.account_id
Expand Down
17 changes: 13 additions & 4 deletions locals.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ data "aws_ami" "al2gpu_ami" {
most_recent = true
filter {
name = "name"
values = ["*amazon-eks-gpu-node-${var.kubernetesVersion}*"]
values = ["*ubuntu-eks/k8s_${var.kubernetesVersion}/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server*"]
}
}

Expand All @@ -30,7 +30,6 @@ locals {
private_subnets = local.create_vpc ? module.vpc[0].private_subnets : (local.use_private_subnets_ids ? var.private_subnet_ids : [for s in data.aws_subnet.private_subnet : s.id])
public_subnets = local.create_vpc ? module.vpc[0].public_subnets : (local.use_public_subnet_ids ? var.public_subnet_ids : [for s in data.aws_subnet.public_subnet : s.id])
# Using a one-line command for gpuPostUserData to avoid issues due to different line endings between Windows and Linux.
gpuPostUserData = "sudo yum -y erase nvidia-driver \nsudo yum -y install make gcc \nsudo yum -y update \nsudo yum -y install gcc kernel-devel-$(uname -r) \nsudo curl -fSsl -O https://us.download.nvidia.com/tesla/${var.gpuNvidiaDriverVersion}/NVIDIA-Linux-x86_64-${var.gpuNvidiaDriverVersion}.run \nsudo chmod +x NVIDIA-Linux-x86_64*.run \nsudo CC=/usr/bin/gcc10-cc ./NVIDIA-Linux-x86_64*.run -s --no-dkms --install-libglvnd \nsudo touch /etc/modprobe.d/nvidia.conf \necho \"options nvidia NVreg_EnableGpuFirmware=0\" | sudo tee --append /etc/modprobe.d/nvidia.conf \nsudo reboot"

default_managed_node_pools = {
"default" = {
Expand Down Expand Up @@ -74,7 +73,12 @@ locals {
disk_size = var.gpuNodeDiskSize
custom_ami_id = data.aws_ami.al2gpu_ami.image_id
create_launch_template = true
post_userdata = local.gpuPostUserData
block_device_mappings = [{
device_name = "/dev/sda1"
volume_type = "gp3"
volume_size = 128
delete_on_termination = true
}]
k8s_labels = {
"purpose" = "gpu"
}
Expand All @@ -99,7 +103,12 @@ locals {
disk_size = var.ivsGpuNodeDiskSize
custom_ami_id = data.aws_ami.al2gpu_ami.image_id
create_launch_template = true
post_userdata = local.gpuPostUserData
block_device_mappings = [{
device_name = "/dev/sda1"
volume_type = "gp3"
volume_size = 128
delete_on_termination = true
}]
k8s_labels = {
"product" = "ivs",
"purpose" = "gpu"
Expand Down
21 changes: 21 additions & 0 deletions modules/k8s_eks_addons/gpu-operator.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
resource "helm_release" "gpu_operator" {
count = var.gpu_operator_config.enable ? 1 : 0

namespace = "kube-system"
name = "gpu-operator"
chart = "gpu-operator"
create_namespace = true
repository = var.gpu_operator_config.helm_repository
version = var.gpu_operator_config.helm_version
description = "The GPU operator HelmChart deployment configuration"
dependency_update = true
values = [
var.gpu_operator_config.chart_values
]
timeout = 1200
set {
name = "driver.version"
value = var.gpu_operator_config.driver_version
}

}
11 changes: 11 additions & 0 deletions modules/k8s_eks_addons/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,14 @@ variable "s3_csi_config" {
})
description = "Input configuration for AWS EKS add-on aws-mountpoint-s3-csi-driver."
}

variable "gpu_operator_config" {
description = "GPU operator configuration"
type = object({
enable = bool
helm_repository = string
helm_version = string
chart_values = string
driver_version = string
})
}
4 changes: 3 additions & 1 deletion terraform.json.example
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
"gpuNodeSize": [
"g5.2xlarge"
],
"gpuNvidiaDriverVersion": "535.54.03",
"gpu_operator_config": {
"enable": false
},
"infrastructurename": "simphera",
"ingress_nginx_config": {
"enable": false
Expand Down
9 changes: 7 additions & 2 deletions terraform.tfvars.example
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,13 @@ gpuNodeSize = [
"g5.2xlarge"
]

# The NVIDIA driver version for GPU node group.
gpuNvidiaDriverVersion = "535.54.03"
# Input configuration for the GPU operator chart deployed with a helm release.
# Setting key 'enable' to 'true' deploys the GPU operator.
# 'helm_repository' is the URL of the GPU operator helm chart repository, and 'helm_version' is the chart version to install.
# 'chart_values' overrides the default values.yaml of the GPU operator chart.
gpu_operator_config = {
"enable": false
}

# The name of the infrastructure. e.g. simphera-infra
infrastructurename = "simphera"
Expand Down
54 changes: 48 additions & 6 deletions variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,6 @@ variable "ivsGpuNodeDiskSize" {
default = 100
}

variable "gpuNvidiaDriverVersion" {
type = string
description = "The NVIDIA driver version for GPU node group."
default = "535.54.03"
}

variable "licenseServer" {
type = bool
description = "Specifies whether a license server VM will be created."
Expand Down Expand Up @@ -375,3 +369,51 @@ variable "aws_load_balancer_controller_config" {
enable = false
}
}

variable "gpu_operator_config" {
type = object({
enable = optional(bool, true)
helm_repository = optional(string, "https://helm.ngc.nvidia.com/nvidia")
helm_version = optional(string, "v24.9.0")
driver_version = optional(string, "550.90.07")
chart_values = optional(string, <<-YAML
operator:
defaultRuntime: containerd
dcgmExporter:
enabled: false
driver:
enabled: true
validator:
driver:
env:
- name: DISABLE_DEV_CHAR_SYMLINK_CREATION
value: "true"
toolkit:
enabled: true
daemonsets:
tolerations:
- key: purpose
value: gpu
operator: Equal
effect: NoSchedule
node-feature-discovery:
worker:
tolerations:
- key: purpose
value: gpu
operator: Equal
effect: NoSchedule
YAML
)
})
description = "Input configuration for the GPU operator chart deployed with helm release. By setting key 'enable' to 'true', GPU operator will be deployed. 'helm_repository' is an URL for the repository of the GPU operator helm chart, where 'helm_version' is its respective version of a chart. 'chart_values' is used for changing default values.yaml of the GPU operator chart."
default = {
enable = false
}
}

0 comments on commit 6fc8294

Please sign in to comment.