From 9ec1d476d3b4a1066fb4c60db00aeca2f8b18737 Mon Sep 17 00:00:00 2001 From: Bryant Biggs Date: Mon, 14 Oct 2024 14:36:41 +0000 Subject: [PATCH] chore: Update GPU patterns to use new AL2023 NVIDIA AMI variant and latest EKS 1.31 (#2031) --- .pre-commit-config.yaml | 4 +- patterns/fargate-serverless/README.md | 2 +- patterns/ml-capacity-block/README.md | 2 +- patterns/ml-capacity-block/eks.tf | 77 ++++++++++++------- patterns/ml-capacity-block/helm.tf | 18 +---- patterns/ml-capacity-block/main.tf | 4 +- patterns/nvidia-gpu-efa/README.md | 4 +- patterns/nvidia-gpu-efa/eks.tf | 33 +++++--- .../nvidia-gpu-efa/generate-efa-nccl-test.sh | 2 +- patterns/nvidia-gpu-efa/helm.tf | 18 +---- patterns/nvidia-gpu-efa/main.tf | 4 +- patterns/targeted-odcr/README.md | 2 +- patterns/targeted-odcr/eks.tf | 37 ++++++--- patterns/targeted-odcr/helm.tf | 18 +---- patterns/targeted-odcr/main.tf | 4 +- 15 files changed, 118 insertions(+), 111 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9102f20bc3..8ef3f7f710 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/streetsidesoftware/cspell-cli - rev: v8.13.3 + rev: v8.15.1 hooks: - id: cspell args: [--exclude, 'ADOPTERS.md', --exclude, '.pre-commit-config.yaml', --exclude, '.gitignore', --exclude, '*.drawio', --exclude, 'mkdocs.yml', --exclude, '.helmignore', --exclude, '.github/workflows/*', --exclude, 'patterns/istio-multi-cluster/*', --exclude, 'patterns/blue-green-upgrade/*', --exclude, '/patterns/vpc-lattice/cross-cluster-pod-communication/*', --exclude, 'patterns/bottlerocket/*', --exclude, 'patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh'] @@ -10,7 +10,7 @@ repos: - id: pretty-format-yaml args: [--autofix, --indent, '2', --offset, '2', --preserve-quotes] - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer diff --git 
a/patterns/fargate-serverless/README.md b/patterns/fargate-serverless/README.md index 615da30e0a..1f35433a79 100644 --- a/patterns/fargate-serverless/README.md +++ b/patterns/fargate-serverless/README.md @@ -45,7 +45,7 @@ See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started 3. Validate the `aws-logging` configMap for Fargate Fluentbit was created: ```sh - kubectl -n aws-observability get configmap aws-logging + kubectl -n aws-observability get configmap aws-logging ``` ```yaml diff --git a/patterns/ml-capacity-block/README.md b/patterns/ml-capacity-block/README.md index c1f8519a36..40b1daef05 100644 --- a/patterns/ml-capacity-block/README.md +++ b/patterns/ml-capacity-block/README.md @@ -13,7 +13,7 @@ This pattern demonstrates how to consume/utilize ML capacity block reservations ## Code -```terraform hl_lines="5-11 80-94 106-109 138-151" +```terraform hl_lines="5-11 93-107 119-122 161-174" {% include "../../patterns/ml-capacity-block/eks.tf" %} ``` diff --git a/patterns/ml-capacity-block/eks.tf b/patterns/ml-capacity-block/eks.tf index 816e2b8890..869f8073f2 100644 --- a/patterns/ml-capacity-block/eks.tf +++ b/patterns/ml-capacity-block/eks.tf @@ -16,10 +16,10 @@ variable "capacity_reservation_id" { module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.17" + version = "~> 20.26" cluster_name = local.name - cluster_version = "1.30" + cluster_version = "1.31" # Give the Terraform identity admin access to the cluster # which will allow it to deploy resources into the cluster @@ -30,7 +30,9 @@ module "eks" { coredns = {} eks-pod-identity-agent = {} kube-proxy = {} - vpc-cni = {} + vpc-cni = { + most_recent = true + } } # Add security group rules on the node group security group to @@ -42,16 +44,27 @@ module "eks" { eks_managed_node_groups = { cbr = { - # The EKS AL2 GPU AMI provides all of the necessary components + # The EKS AL2023 NVIDIA AMI provides all of the necessary components # for accelerated workloads w/ EFA - 
ami_type = "AL2_x86_64_GPU" - instance_types = ["p5.48xlarge"] - - pre_bootstrap_user_data = <<-EOT - # Mount instance store volumes in RAID-0 for kubelet and containerd - # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 - /bin/setup-local-disks raid0 - EOT + ami_type = "AL2023_x86_64_NVIDIA" + instance_types = ["p5e.48xlarge"] + + # Mount instance store volumes in RAID-0 for kubelet and containerd + # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 + cloudinit_pre_nodeadm = [ + { + content_type = "application/node.eks.aws" + content = <<-EOT + --- + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + instance: + localStorage: + strategy: RAID0 + EOT + } + ] min_size = 2 max_size = 2 @@ -97,7 +110,7 @@ module "eks" { default = { instance_types = ["m5.large"] - min_size = 1 + min_size = 2 max_size = 2 desired_size = 2 } @@ -109,21 +122,31 @@ module "eks" { # the one that works for their use case. 
self_managed_node_groups = { cbr2 = { - # The EKS AL2 GPU AMI provides all of the necessary components + # The EKS AL2023 NVIDIA AMI provides all of the necessary components # for accelerated workloads w/ EFA - ami_type = "AL2_x86_64_GPU" - instance_type = "p5.48xlarge" - - pre_bootstrap_user_data = <<-EOT - # Mount instance store volumes in RAID-0 for kubelet and containerd - # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 - /bin/setup-local-disks raid0 - - # Ensure only GPU workloads are scheduled on this node group - export KUBELET_EXTRA_ARGS='--node-labels=vpc.amazonaws.com/efa.present=true,nvidia.com/gpu.present=true \ - --register-with-taints=nvidia.com/gpu=true:NoSchedule' - - EOT + ami_type = "AL2023_x86_64_NVIDIA" + instance_type = "p5e.48xlarge" + + # Mount instance store volumes in RAID-0 for kubelet and containerd + # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 + cloudinit_pre_nodeadm = [ + { + content_type = "application/node.eks.aws" + content = <<-EOT + --- + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + instance: + localStorage: + strategy: RAID0 + kubelet: + flags: + - --node-labels=vpc.amazonaws.com/efa.present=true,nvidia.com/gpu.present=true + - --register-with-taints=nvidia.com/gpu=true:NoSchedule + EOT + } + ] min_size = 2 max_size = 2 diff --git a/patterns/ml-capacity-block/helm.tf b/patterns/ml-capacity-block/helm.tf index 7482ab3b8e..588ba0547f 100644 --- a/patterns/ml-capacity-block/helm.tf +++ b/patterns/ml-capacity-block/helm.tf @@ -6,31 +6,17 @@ resource "helm_release" "nvidia_device_plugin" { name = "nvidia-device-plugin" repository = "https://nvidia.github.io/k8s-device-plugin" chart = "nvidia-device-plugin" - version = "0.14.5" + version = "0.16.2" namespace = "nvidia-device-plugin" create_namespace = true wait = false - - values = [ - <<-EOT - affinity: - nodeAffinity: 
- requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: 'nvidia.com/gpu.present' - operator: In - values: - - 'true' - EOT - ] } resource "helm_release" "aws_efa_device_plugin" { name = "aws-efa-k8s-device-plugin" repository = "https://aws.github.io/eks-charts" chart = "aws-efa-k8s-device-plugin" - version = "v0.5.2" + version = "v0.5.5" namespace = "kube-system" wait = false diff --git a/patterns/ml-capacity-block/main.tf b/patterns/ml-capacity-block/main.tf index 5f4e3e16ba..cb9d8c6807 100644 --- a/patterns/ml-capacity-block/main.tf +++ b/patterns/ml-capacity-block/main.tf @@ -4,11 +4,11 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.57" + version = ">= 5.70" } helm = { source = "hashicorp/helm" - version = ">= 2.9" + version = ">= 2.16" } } diff --git a/patterns/nvidia-gpu-efa/README.md b/patterns/nvidia-gpu-efa/README.md index f8f92bbf9e..c3ac62368f 100644 --- a/patterns/nvidia-gpu-efa/README.md +++ b/patterns/nvidia-gpu-efa/README.md @@ -17,11 +17,11 @@ The following components are demonstrated in this pattern: ## Code -```terraform hl_lines="24-26 32-67" +```terraform hl_lines="26-28 34-80" {% include "../../patterns/nvidia-gpu-efa/eks.tf" %} ``` -```terraform hl_lines="5-47" +```terraform hl_lines="5-33" {% include "../../patterns/nvidia-gpu-efa/helm.tf" %} ``` diff --git a/patterns/nvidia-gpu-efa/eks.tf b/patterns/nvidia-gpu-efa/eks.tf index 94da0aed3d..7b0b282258 100644 --- a/patterns/nvidia-gpu-efa/eks.tf +++ b/patterns/nvidia-gpu-efa/eks.tf @@ -4,10 +4,10 @@ module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.17" + version = "~> 20.26" cluster_name = local.name - cluster_version = "1.30" + cluster_version = "1.31" # Give the Terraform identity admin access to the cluster # which will allow it to deploy resources into the cluster @@ -18,7 +18,9 @@ module "eks" { coredns = {} eks-pod-identity-agent = {} kube-proxy = {} - vpc-cni = {} + vpc-cni = { + 
most_recent = true + } } # Add security group rules on the node group security group to @@ -30,16 +32,27 @@ module "eks" { eks_managed_node_groups = { nvidia-efa = { - # The EKS AL2 GPU AMI provides all of the necessary components + # The EKS AL2023 NVIDIA AMI provides all of the necessary components # for accelerated workloads w/ EFA - ami_type = "AL2_x86_64_GPU" + ami_type = "AL2023_x86_64_NVIDIA" instance_types = ["p5.48xlarge"] - pre_bootstrap_user_data = <<-EOT - # Mount instance store volumes in RAID-0 for kubelet and containerd - # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 - /bin/setup-local-disks raid0 - EOT + # Mount instance store volumes in RAID-0 for kubelet and containerd + # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 + cloudinit_pre_nodeadm = [ + { + content_type = "application/node.eks.aws" + content = <<-EOT + --- + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + instance: + localStorage: + strategy: RAID0 + EOT + } + ] min_size = 2 max_size = 2 diff --git a/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh b/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh index 5a376f0799..f58ae56ffb 100755 --- a/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh +++ b/patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh @@ -2,7 +2,7 @@ export MPI_JOB_NAME=efa-nccl-test export IMAGE_URI=public.ecr.aws/hpc-cloud/nccl-tests:latest -export INSTANCE_TYPE=p5.48xlarge +export INSTANCE_TYPE=p5.48xlarge export NUM_WORKERS=2 export GPU_PER_WORKER=8 export EFA_PER_WORKER=32 diff --git a/patterns/nvidia-gpu-efa/helm.tf b/patterns/nvidia-gpu-efa/helm.tf index 7482ab3b8e..588ba0547f 100644 --- a/patterns/nvidia-gpu-efa/helm.tf +++ b/patterns/nvidia-gpu-efa/helm.tf @@ -6,31 +6,17 @@ resource "helm_release" "nvidia_device_plugin" { name = "nvidia-device-plugin" repository =
"https://nvidia.github.io/k8s-device-plugin" chart = "nvidia-device-plugin" - version = "0.14.5" + version = "0.16.2" namespace = "nvidia-device-plugin" create_namespace = true wait = false - - values = [ - <<-EOT - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: 'nvidia.com/gpu.present' - operator: In - values: - - 'true' - EOT - ] } resource "helm_release" "aws_efa_device_plugin" { name = "aws-efa-k8s-device-plugin" repository = "https://aws.github.io/eks-charts" chart = "aws-efa-k8s-device-plugin" - version = "v0.5.2" + version = "v0.5.5" namespace = "kube-system" wait = false diff --git a/patterns/nvidia-gpu-efa/main.tf b/patterns/nvidia-gpu-efa/main.tf index 5c797a6d0b..2c28779d11 100644 --- a/patterns/nvidia-gpu-efa/main.tf +++ b/patterns/nvidia-gpu-efa/main.tf @@ -4,11 +4,11 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.34" + version = ">= 5.70" } helm = { source = "hashicorp/helm" - version = ">= 2.9" + version = ">= 2.16" } } diff --git a/patterns/targeted-odcr/README.md b/patterns/targeted-odcr/README.md index 7b7a3596d6..41694edafa 100644 --- a/patterns/targeted-odcr/README.md +++ b/patterns/targeted-odcr/README.md @@ -18,7 +18,7 @@ This pattern demonstrates how to consume/utilize on-demand capacity reservations ## Code -```terraform hl_lines="5-8 81-88 108-131" +```terraform hl_lines="5-8 94-104 124-147" {% include "../../patterns/targeted-odcr/eks.tf" %} ``` diff --git a/patterns/targeted-odcr/eks.tf b/patterns/targeted-odcr/eks.tf index 2945d8bb0e..f10d403ef2 100644 --- a/patterns/targeted-odcr/eks.tf +++ b/patterns/targeted-odcr/eks.tf @@ -13,10 +13,10 @@ variable "capacity_reservation_arns" { module "eks" { source = "terraform-aws-modules/eks/aws" - version = "~> 20.17" + version = "~> 20.26" cluster_name = local.name - cluster_version = "1.30" + cluster_version = "1.31" # Give the Terraform identity admin access to the cluster # 
which will allow it to deploy resources into the cluster @@ -27,7 +27,9 @@ module "eks" { coredns = {} eks-pod-identity-agent = {} kube-proxy = {} - vpc-cni = {} + vpc-cni = { + most_recent = true + } } # Add security group rules on the node group security group to @@ -39,16 +41,27 @@ module "eks" { eks_managed_node_groups = { odcr = { - # The EKS AL2 GPU AMI provides all of the necessary components + # The EKS AL2023 NVIDIA AMI provides all of the necessary components # for accelerated workloads w/ EFA - ami_type = "AL2_x86_64_GPU" - instance_type = "p5.48xlarge" - - pre_bootstrap_user_data = <<-EOT - # Mount instance store volumes in RAID-0 for kubelet and containerd - # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 - /bin/setup-local-disks raid0 - EOT + ami_type = "AL2023_x86_64_NVIDIA" + instance_types = ["p5.48xlarge"] + + # Mount instance store volumes in RAID-0 for kubelet and containerd + # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 + cloudinit_pre_nodeadm = [ + { + content_type = "application/node.eks.aws" + content = <<-EOT + --- + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + instance: + localStorage: + strategy: RAID0 + EOT + } + ] min_size = 2 max_size = 2 diff --git a/patterns/targeted-odcr/helm.tf b/patterns/targeted-odcr/helm.tf index 7482ab3b8e..588ba0547f 100644 --- a/patterns/targeted-odcr/helm.tf +++ b/patterns/targeted-odcr/helm.tf @@ -6,31 +6,17 @@ resource "helm_release" "nvidia_device_plugin" { name = "nvidia-device-plugin" repository = "https://nvidia.github.io/k8s-device-plugin" chart = "nvidia-device-plugin" - version = "0.14.5" + version = "0.16.2" namespace = "nvidia-device-plugin" create_namespace = true wait = false - - values = [ - <<-EOT - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: 
'nvidia.com/gpu.present' - operator: In - values: - - 'true' - EOT - ] } resource "helm_release" "aws_efa_device_plugin" { name = "aws-efa-k8s-device-plugin" repository = "https://aws.github.io/eks-charts" chart = "aws-efa-k8s-device-plugin" - version = "v0.5.2" + version = "v0.5.5" namespace = "kube-system" wait = false diff --git a/patterns/targeted-odcr/main.tf b/patterns/targeted-odcr/main.tf index 9eec67edcd..51270d504e 100644 --- a/patterns/targeted-odcr/main.tf +++ b/patterns/targeted-odcr/main.tf @@ -4,11 +4,11 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 5.34" + version = ">= 5.70" } helm = { source = "hashicorp/helm" - version = ">= 2.9" + version = ">= 2.16" } }