Skip to content

Commit

Permalink
chore: Update GPU patterns to use new AL2023 NVIDIA AMI variant and latest EKS 1.31 (#2031)
Browse files Browse the repository at this point in the history
  • Loading branch information
bryantbiggs authored Oct 14, 2024
1 parent e7863cf commit 9ec1d47
Show file tree
Hide file tree
Showing 15 changed files with 118 additions and 111 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/streetsidesoftware/cspell-cli
rev: v8.13.3
rev: v8.15.1
hooks:
- id: cspell
args: [--exclude, 'ADOPTERS.md', --exclude, '.pre-commit-config.yaml', --exclude, '.gitignore', --exclude, '*.drawio', --exclude, 'mkdocs.yml', --exclude, '.helmignore', --exclude, '.github/workflows/*', --exclude, 'patterns/istio-multi-cluster/*', --exclude, 'patterns/blue-green-upgrade/*', --exclude, '/patterns/vpc-lattice/cross-cluster-pod-communication/*', --exclude, 'patterns/bottlerocket/*', --exclude, 'patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh']
Expand All @@ -10,7 +10,7 @@ repos:
- id: pretty-format-yaml
args: [--autofix, --indent, '2', --offset, '2', --preserve-quotes]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand Down
2 changes: 1 addition & 1 deletion patterns/fargate-serverless/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started
3. Validate the `aws-logging` configMap for Fargate Fluentbit was created:

```sh
kubectl -n aws-observability get configmap aws-logging
kubectl -n aws-observability get configmap aws-logging
```

```yaml
Expand Down
2 changes: 1 addition & 1 deletion patterns/ml-capacity-block/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ This pattern demonstrates how to consume/utilize ML capacity block reservations

## Code

```terraform hl_lines="5-11 80-94 106-109 138-151"
```terraform hl_lines="5-11 93-107 119-122 161-174"
{% include "../../patterns/ml-capacity-block/eks.tf" %}
```

Expand Down
77 changes: 50 additions & 27 deletions patterns/ml-capacity-block/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ variable "capacity_reservation_id" {

module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "~> 20.17"
version = "~> 20.26"

cluster_name = local.name
cluster_version = "1.30"
cluster_version = "1.31"

# Give the Terraform identity admin access to the cluster
# which will allow it to deploy resources into the cluster
Expand All @@ -30,7 +30,9 @@ module "eks" {
coredns = {}
eks-pod-identity-agent = {}
kube-proxy = {}
vpc-cni = {}
vpc-cni = {
most_recent = true
}
}

# Add security group rules on the node group security group to
Expand All @@ -42,16 +44,27 @@ module "eks" {

eks_managed_node_groups = {
cbr = {
# The EKS AL2 GPU AMI provides all of the necessary components
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
# for accelerated workloads w/ EFA
ami_type = "AL2_x86_64_GPU"
instance_types = ["p5.48xlarge"]

pre_bootstrap_user_data = <<-EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
/bin/setup-local-disks raid0
EOT
ami_type = "AL2023_x86_64_NVIDIA"
instance_types = ["p5e.48xlarge"]

# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
cloudinit_pre_nodeadm = [
{
content_type = "application/node.eks.aws"
content = <<-EOT
---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
instance:
localStorage:
strategy: RAID0
EOT
}
]

min_size = 2
max_size = 2
Expand Down Expand Up @@ -97,7 +110,7 @@ module "eks" {
default = {
instance_types = ["m5.large"]

min_size = 1
min_size = 2
max_size = 2
desired_size = 2
}
Expand All @@ -109,21 +122,31 @@ module "eks" {
# the one that works for their use case.
self_managed_node_groups = {
cbr2 = {
# The EKS AL2 GPU AMI provides all of the necessary components
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
# for accelerated workloads w/ EFA
ami_type = "AL2_x86_64_GPU"
instance_type = "p5.48xlarge"

pre_bootstrap_user_data = <<-EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
/bin/setup-local-disks raid0
# Ensure only GPU workloads are scheduled on this node group
export KUBELET_EXTRA_ARGS='--node-labels=vpc.amazonaws.com/efa.present=true,nvidia.com/gpu.present=true \
--register-with-taints=nvidia.com/gpu=true:NoSchedule'
EOT
ami_type = "AL2023_x86_64_NVIDIA"
instance_type = "p5e.48xlarge"

# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
cloudinit_pre_nodeadm = [
{
content_type = "application/node.eks.aws"
content = <<-EOT
---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
instance:
localStorage:
strategy: RAID0
kubelet:
flags:
- --node-labels=vpc.amazonaws.com/efa.present=true,nvidia.com/gpu.present=true
- --register-with-taints=nvidia.com/gpu=true:NoSchedule
EOT
}
]

min_size = 2
max_size = 2
Expand Down
18 changes: 2 additions & 16 deletions patterns/ml-capacity-block/helm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,17 @@ resource "helm_release" "nvidia_device_plugin" {
name = "nvidia-device-plugin"
repository = "https://nvidia.github.io/k8s-device-plugin"
chart = "nvidia-device-plugin"
version = "0.14.5"
version = "0.16.2"
namespace = "nvidia-device-plugin"
create_namespace = true
wait = false

values = [
<<-EOT
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: 'nvidia.com/gpu.present'
operator: In
values:
- 'true'
EOT
]
}

resource "helm_release" "aws_efa_device_plugin" {
name = "aws-efa-k8s-device-plugin"
repository = "https://aws.github.io/eks-charts"
chart = "aws-efa-k8s-device-plugin"
version = "v0.5.2"
version = "v0.5.5"
namespace = "kube-system"
wait = false

Expand Down
4 changes: 2 additions & 2 deletions patterns/ml-capacity-block/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 5.57"
version = ">= 5.70"
}
helm = {
source = "hashicorp/helm"
version = ">= 2.9"
version = ">= 2.16"
}
}

Expand Down
4 changes: 2 additions & 2 deletions patterns/nvidia-gpu-efa/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ The following components are demonstrated in this pattern:

## Code

```terraform hl_lines="24-26 32-67"
```terraform hl_lines="26-28 34-80"
{% include "../../patterns/nvidia-gpu-efa/eks.tf" %}
```

```terraform hl_lines="5-47"
```terraform hl_lines="5-33"
{% include "../../patterns/nvidia-gpu-efa/helm.tf" %}
```

Expand Down
33 changes: 23 additions & 10 deletions patterns/nvidia-gpu-efa/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@

module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "~> 20.17"
version = "~> 20.26"

cluster_name = local.name
cluster_version = "1.30"
cluster_version = "1.31"

# Give the Terraform identity admin access to the cluster
# which will allow it to deploy resources into the cluster
Expand All @@ -18,7 +18,9 @@ module "eks" {
coredns = {}
eks-pod-identity-agent = {}
kube-proxy = {}
vpc-cni = {}
vpc-cni = {
most_recent = true
}
}

# Add security group rules on the node group security group to
Expand All @@ -30,16 +32,27 @@ module "eks" {

eks_managed_node_groups = {
nvidia-efa = {
# The EKS AL2 GPU AMI provides all of the necessary components
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
# for accelerated workloads w/ EFA
ami_type = "AL2_x86_64_GPU"
ami_type = "AL2023_x86_64_NVIDIA"
instance_types = ["p5.48xlarge"]

pre_bootstrap_user_data = <<-EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
/bin/setup-local-disks raid0
EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
cloudinit_pre_nodeadm = [
{
content_type = "application/node.eks.aws"
content = <<-EOT
---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
instance:
localStorage:
strategy: RAID0
EOT
}
]

min_size = 2
max_size = 2
Expand Down
2 changes: 1 addition & 1 deletion patterns/nvidia-gpu-efa/generate-efa-nccl-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

export MPI_JOB_NAME=efa-nccl-test
export IMAGE_URI=public.ecr.aws/hpc-cloud/nccl-tests:latest
export INSTANCE_TYPE=p5.48xlarge
export INSTANCE_TYPE=p5e.48xlarge
export NUM_WORKERS=2
export GPU_PER_WORKER=8
export EFA_PER_WORKER=32
Expand Down
18 changes: 2 additions & 16 deletions patterns/nvidia-gpu-efa/helm.tf
Original file line number Diff line number Diff line change
Expand Up @@ -6,31 +6,17 @@ resource "helm_release" "nvidia_device_plugin" {
name = "nvidia-device-plugin"
repository = "https://nvidia.github.io/k8s-device-plugin"
chart = "nvidia-device-plugin"
version = "0.14.5"
version = "0.16.2"
namespace = "nvidia-device-plugin"
create_namespace = true
wait = false

values = [
<<-EOT
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: 'nvidia.com/gpu.present'
operator: In
values:
- 'true'
EOT
]
}

resource "helm_release" "aws_efa_device_plugin" {
name = "aws-efa-k8s-device-plugin"
repository = "https://aws.github.io/eks-charts"
chart = "aws-efa-k8s-device-plugin"
version = "v0.5.2"
version = "v0.5.5"
namespace = "kube-system"
wait = false

Expand Down
4 changes: 2 additions & 2 deletions patterns/nvidia-gpu-efa/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = ">= 5.34"
version = ">= 5.70"
}
helm = {
source = "hashicorp/helm"
version = ">= 2.9"
version = ">= 2.16"
}
}

Expand Down
2 changes: 1 addition & 1 deletion patterns/targeted-odcr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ This pattern demonstrates how to consume/utilize on-demand capacity reservations

## Code

```terraform hl_lines="5-8 81-88 108-131"
```terraform hl_lines="5-8 94-104 124-147"
{% include "../../patterns/targeted-odcr/eks.tf" %}
```

Expand Down
37 changes: 25 additions & 12 deletions patterns/targeted-odcr/eks.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ variable "capacity_reservation_arns" {

module "eks" {
source = "terraform-aws-modules/eks/aws"
version = "~> 20.17"
version = "~> 20.26"

cluster_name = local.name
cluster_version = "1.30"
cluster_version = "1.31"

# Give the Terraform identity admin access to the cluster
# which will allow it to deploy resources into the cluster
Expand All @@ -27,7 +27,9 @@ module "eks" {
coredns = {}
eks-pod-identity-agent = {}
kube-proxy = {}
vpc-cni = {}
vpc-cni = {
most_recent = true
}
}

# Add security group rules on the node group security group to
Expand All @@ -39,16 +41,27 @@ module "eks" {

eks_managed_node_groups = {
odcr = {
# The EKS AL2 GPU AMI provides all of the necessary components
# The EKS AL2023 NVIDIA AMI provides all of the necessary components
# for accelerated workloads w/ EFA
ami_type = "AL2_x86_64_GPU"
instance_type = "p5.48xlarge"

pre_bootstrap_user_data = <<-EOT
# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
/bin/setup-local-disks raid0
EOT
ami_type = "AL2023_x86_64_NVIDIA"
instance_types = ["p5.48xlarge"]

# Mount instance store volumes in RAID-0 for kubelet and containerd
# https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0
cloudinit_pre_nodeadm = [
{
content_type = "application/node.eks.aws"
content = <<-EOT
---
apiVersion: node.eks.aws/v1alpha1
kind: NodeConfig
spec:
instance:
localStorage:
strategy: RAID0
EOT
}
]

min_size = 2
max_size = 2
Expand Down
Loading

0 comments on commit 9ec1d47

Please sign in to comment.