diff --git a/patterns/ml-capacity-block/README.md b/patterns/ml-capacity-block/README.md index 58f21ee7c6..7dbda5acb6 100644 --- a/patterns/ml-capacity-block/README.md +++ b/patterns/ml-capacity-block/README.md @@ -2,11 +2,11 @@ This pattern demonstrates how to consume/utilize ML capacity block reservations (CBR) with Amazon EKS. The solution is comprised of primarily 2 components: -!!! warning - The use of self-managed node group(s) are required at this time to support capacity block reservations within EKS. This pattern will be updated to demonstrate EKS managed node groups once support has been implemented by the EKS service. - 1. The self-managed node group that will utilize the CBR should have the subnets provided to it restricted to the availability zone where the CBR has been allocated. For example - if the CBR is allocated to `us-west-2b`, the node group should only have subnet IDs provided to it that reside in `us-west-2b`. If the subnets that reside in other AZs are provided, its possible to encounter an error such as `InvalidParameterException: The following supplied instance types do not exist ...`. It is not guaranteed that this error will always be shown, and may appear random since the underlying autoscaling group(s) will provision nodes into different AZs at random. It will only occur when the underlying autoscaling group tries to provision instances into an AZ where capacity is not allocated and there is insufficient on-demand capacity for the desired instance type. + !!! warning + The use of self-managed node group(s) is required at this time to support capacity block reservations within EKS. This pattern will be updated to demonstrate EKS managed node groups once support has been implemented by the EKS service. + 2. The launch template utilized should specify the `instance_market_options` and `capacity_reservation_specification` arguments. This is how the CBR is utilized by the node group (i.e. 
- tells the autoscaling group to launch instances utilizing provided capacity reservation). Links: @@ -16,7 +16,7 @@ This pattern demonstrates how to consume/utilize ML capacity block reservations ## Code -```terraform hl_lines="53-93" +```terraform hl_lines="5-11 54-56 84-92" {% include "../../patterns/ml-capacity-block/eks.tf" %} ``` diff --git a/patterns/ml-capacity-block/eks.tf b/patterns/ml-capacity-block/eks.tf index da0d7235e3..831d3df95e 100644 --- a/patterns/ml-capacity-block/eks.tf +++ b/patterns/ml-capacity-block/eks.tf @@ -6,7 +6,7 @@ # on how to obtain a ML capacity block reservation. Once acquired, you can provide # the reservation ID through this input to deploy the pattern variable "capacity_reservation_id" { - description = "The ID of the ML capacity block reservation to use for the node group" + description = "The ID of the ML capacity block reservation for the node group" type = string } @@ -27,9 +27,10 @@ module "eks" { cluster_endpoint_public_access = true cluster_addons = { - coredns = {} - kube-proxy = {} - vpc-cni = {} + coredns = {} + eks-pod-identity-agent = {} + kube-proxy = {} + vpc-cni = {} } # Add security group rules on the node group security group to @@ -53,7 +54,7 @@ module "eks" { # Note: ML capacity block reservations are only supported # on self-managed node groups at this time self_managed_node_groups = { - odcr = { + cbr = { # The EKS AL2 GPU AMI provides all of the necessary components # for accelerated workloads w/ EFA ami_type = "AL2_x86_64_GPU" @@ -94,51 +95,3 @@ module "eks" { tags = local.tags } - -################################################################################ -# Helm charts -################################################################################ - -resource "helm_release" "nvidia_device_plugin" { - name = "nvidia-device-plugin" - repository = "https://nvidia.github.io/k8s-device-plugin" - chart = "nvidia-device-plugin" - version = "0.14.5" - namespace = "nvidia-device-plugin" - 
create_namespace = true - wait = false - - values = [ - <<-EOT - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: 'nvidia.com/gpu.present' - operator: In - values: - - 'true' - EOT - ] -} - -resource "helm_release" "aws_efa_device_plugin" { - name = "aws-efa-k8s-device-plugin" - repository = "https://aws.github.io/eks-charts" - chart = "aws-efa-k8s-device-plugin" - version = "v0.4.4" - namespace = "kube-system" - wait = false - - values = [ - <<-EOT - nodeSelector: - vpc.amazonaws.com/efa.present: 'true' - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - EOT - ] -} diff --git a/patterns/ml-capacity-block/helm.tf b/patterns/ml-capacity-block/helm.tf new file mode 100644 index 0000000000..765c26c465 --- /dev/null +++ b/patterns/ml-capacity-block/helm.tf @@ -0,0 +1,47 @@ +################################################################################ +# Helm charts +################################################################################ + +resource "helm_release" "nvidia_device_plugin" { + name = "nvidia-device-plugin" + repository = "https://nvidia.github.io/k8s-device-plugin" + chart = "nvidia-device-plugin" + version = "0.14.5" + namespace = "nvidia-device-plugin" + create_namespace = true + wait = false + + values = [ + <<-EOT + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: 'nvidia.com/gpu.present' + operator: In + values: + - 'true' + EOT + ] +} + +resource "helm_release" "aws_efa_device_plugin" { + name = "aws-efa-k8s-device-plugin" + repository = "https://aws.github.io/eks-charts" + chart = "aws-efa-k8s-device-plugin" + version = "v0.4.4" + namespace = "kube-system" + wait = false + + values = [ + <<-EOT + nodeSelector: + vpc.amazonaws.com/efa.present: 'true' + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + EOT + ] +} diff --git 
a/patterns/ml-capacity-block/main.tf b/patterns/ml-capacity-block/main.tf index 4fd6d95715..cbf5a8d68f 100644 --- a/patterns/ml-capacity-block/main.tf +++ b/patterns/ml-capacity-block/main.tf @@ -57,6 +57,15 @@ locals { } } +################################################################################ +# Output +################################################################################ + +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" +} + ################################################################################ # Supporting Resources ################################################################################ diff --git a/patterns/nvidia-gpu-efa/README.md b/patterns/nvidia-gpu-efa/README.md index e43128c7e2..b352a19863 100644 --- a/patterns/nvidia-gpu-efa/README.md +++ b/patterns/nvidia-gpu-efa/README.md @@ -17,10 +17,14 @@ The following components are demonstrated in this pattern: ## Code -```terraform hl_lines="23-25 31-68" +```terraform hl_lines="24-26 32-67" {% include "../../patterns/nvidia-gpu-efa/eks.tf" %} ``` +```terraform hl_lines="5-47" +{% include "../../patterns/nvidia-gpu-efa/helm.tf" %} +``` + ## Deploy See [here](https://aws-ia.github.io/terraform-aws-eks-blueprints/getting-started/#prerequisites) for the prerequisites and steps to deploy this pattern. 
diff --git a/patterns/nvidia-gpu-efa/eks.tf b/patterns/nvidia-gpu-efa/eks.tf index 51c927a725..9193ce6bb3 100644 --- a/patterns/nvidia-gpu-efa/eks.tf +++ b/patterns/nvidia-gpu-efa/eks.tf @@ -15,9 +15,10 @@ module "eks" { cluster_endpoint_public_access = true cluster_addons = { - coredns = {} - kube-proxy = {} - vpc-cni = {} + coredns = {} + eks-pod-identity-agent = {} + kube-proxy = {} + vpc-cni = {} } # Add security group rules on the node group security group to @@ -35,8 +36,6 @@ module "eks" { instance_types = ["p5.48xlarge"] pre_bootstrap_user_data = <<-EOT - #!/usr/bin/env bash - # Mount instance store volumes in RAID-0 for kubelet and containerd # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 /bin/setup-local-disks raid0 @@ -71,18 +70,6 @@ module "eks" { default = { instance_types = ["m5.large"] - # Default AMI has only 8GB of storage - block_device_mappings = { - xvda = { - device_name = "/dev/xvda" - ebs = { - volume_size = 128 - volume_type = "gp3" - delete_on_termination = true - } - } - } - min_size = 1 max_size = 2 desired_size = 2 @@ -91,51 +78,3 @@ module "eks" { tags = local.tags } - -################################################################################ -# Helm charts -################################################################################ - -resource "helm_release" "nvidia_device_plugin" { - name = "nvidia-device-plugin" - repository = "https://nvidia.github.io/k8s-device-plugin" - chart = "nvidia-device-plugin" - version = "0.14.5" - namespace = "nvidia-device-plugin" - create_namespace = true - wait = false - - values = [ - <<-EOT - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: 'nvidia.com/gpu.present' - operator: In - values: - - 'true' - EOT - ] -} - -resource "helm_release" "aws_efa_device_plugin" { - name = "aws-efa-k8s-device-plugin" - repository = 
"https://aws.github.io/eks-charts" - chart = "aws-efa-k8s-device-plugin" - version = "v0.4.4" - namespace = "kube-system" - wait = false - - values = [ - <<-EOT - nodeSelector: - vpc.amazonaws.com/efa.present: 'true' - tolerations: - - key: nvidia.com/gpu - operator: Exists - effect: NoSchedule - EOT - ] -} diff --git a/patterns/nvidia-gpu-efa/helm.tf b/patterns/nvidia-gpu-efa/helm.tf new file mode 100644 index 0000000000..765c26c465 --- /dev/null +++ b/patterns/nvidia-gpu-efa/helm.tf @@ -0,0 +1,47 @@ +################################################################################ +# Helm charts +################################################################################ + +resource "helm_release" "nvidia_device_plugin" { + name = "nvidia-device-plugin" + repository = "https://nvidia.github.io/k8s-device-plugin" + chart = "nvidia-device-plugin" + version = "0.14.5" + namespace = "nvidia-device-plugin" + create_namespace = true + wait = false + + values = [ + <<-EOT + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: 'nvidia.com/gpu.present' + operator: In + values: + - 'true' + EOT + ] +} + +resource "helm_release" "aws_efa_device_plugin" { + name = "aws-efa-k8s-device-plugin" + repository = "https://aws.github.io/eks-charts" + chart = "aws-efa-k8s-device-plugin" + version = "v0.4.4" + namespace = "kube-system" + wait = false + + values = [ + <<-EOT + nodeSelector: + vpc.amazonaws.com/efa.present: 'true' + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + EOT + ] +} diff --git a/patterns/nvidia-gpu-efa/main.tf b/patterns/nvidia-gpu-efa/main.tf index e11150c06f..03b3fced85 100644 --- a/patterns/nvidia-gpu-efa/main.tf +++ b/patterns/nvidia-gpu-efa/main.tf @@ -57,6 +57,15 @@ locals { } } +################################################################################ +# Output 
+################################################################################ + +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" +} + ################################################################################ # Supporting Resources ################################################################################ diff --git a/patterns/targeted-odcr/README.md b/patterns/targeted-odcr/README.md index 7b2238907e..7b7a3596d6 100644 --- a/patterns/targeted-odcr/README.md +++ b/patterns/targeted-odcr/README.md @@ -18,7 +18,7 @@ This pattern demonstrates how to consume/utilize on-demand capacity reservations ## Code -```terraform hl_lines="34-51" +```terraform hl_lines="5-8 81-88 108-131" {% include "../../patterns/targeted-odcr/eks.tf" %} ``` diff --git a/patterns/targeted-odcr/eks.tf b/patterns/targeted-odcr/eks.tf index e7214a742a..09c3345ccd 100644 --- a/patterns/targeted-odcr/eks.tf +++ b/patterns/targeted-odcr/eks.tf @@ -1,3 +1,12 @@ +################################################################################ +# Required Input +################################################################################ + +variable "capacity_reservation_arns" { + description = "List of on-demand capacity reservation ARNs for the node group" + type = list(string) +} + ################################################################################ # Cluster ################################################################################ @@ -9,36 +18,66 @@ module "eks" { cluster_name = local.name cluster_version = "1.29" - # To facilitate easier interaction for demonstration purposes - cluster_endpoint_public_access = true - - # Gives Terraform identity admin access to the cluster + # Give the Terraform identity admin access to the cluster 
+ # which will allow it to deploy resources into the cluster enable_cluster_creator_admin_permissions = true + cluster_endpoint_public_access = true cluster_addons = { - coredns = {} - kube-proxy = {} - vpc-cni = {} + coredns = {} + eks-pod-identity-agent = {} + kube-proxy = {} + vpc-cni = {} } + # Add security group rules on the node group security group to + # allow EFA traffic + enable_efa_support = true + vpc_id = module.vpc.vpc_id subnet_ids = module.vpc.private_subnets - eks_managed_node_group_defaults = { - iam_role_additional_policies = { - # Not required, but used in the example to access the nodes to inspect drivers and devices - AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" - } - } - eks_managed_node_groups = { odcr = { - instance_types = ["t3.micro", "t3.small"] + # The EKS AL2 GPU AMI provides all of the necessary components + # for accelerated workloads w/ EFA + ami_type = "AL2_x86_64_GPU" + instance_type = "p5.48xlarge" + + pre_bootstrap_user_data = <<-EOT + # Mount instance store volumes in RAID-0 for kubelet and containerd + # https://github.com/awslabs/amazon-eks-ami/blob/master/doc/USER_GUIDE.md#raid-0-for-kubelet-and-containerd-raid0 + /bin/setup-local-disks raid0 + EOT + + min_size = 2 + max_size = 2 + desired_size = 2 + + # This will: + # 1. Create a placement group to place the instances close to one another + # 2. Ignore subnets that reside in AZs that do not support the instance type + # 3. 
Expose all of the available EFA interfaces on the launch template + enable_efa_support = true min_size = 4 max_size = 5 desired_size = 2 + labels = { + "vpc.amazonaws.com/efa.present" = "true" + "nvidia.com/gpu.present" = "true" + } + + taints = { + # Ensure only GPU workloads are scheduled on this node group + gpu = { + key = "nvidia.com/gpu" + value = "true" + effect = "NO_SCHEDULE" + } + } + # First subnet is in the "${local.region}a" availability zone # where the capacity reservation is created subnet_ids = [element(module.vpc.private_subnets, 0)] @@ -48,6 +87,15 @@ module "eks" { } } } + + # This node group is for core addons such as CoreDNS + default = { + instance_types = ["m5.large"] + + min_size = 1 + max_size = 2 + desired_size = 2 + } } tags = local.tags @@ -75,55 +123,9 @@ resource "aws_resourcegroups_group" "odcr" { } } -resource "aws_resourcegroups_resource" "odcr_1" { - group_arn = aws_resourcegroups_group.odcr.arn - # Replace the following with the ARN of the capacity reservation - # provided by AWS when supplied with a capacity reservation - resource_arn = aws_ec2_capacity_reservation.micro.arn -} - -resource "aws_resourcegroups_resource" "odcr_2" { - group_arn = aws_resourcegroups_group.odcr.arn - # Replace the following with the ARN of the capacity reservation - # provided by AWS when supplied with a capacity reservation - resource_arn = aws_ec2_capacity_reservation.small.arn -} - -################################################################################ -# Capacity Reservation -# These are created for the example, but are not necessary when -# AWS EC2 provides you with a capacity reservation ID -################################################################################ - -resource "aws_ec2_capacity_reservation" "micro" { - instance_type = "t3.micro" - instance_platform = "Linux/UNIX" - availability_zone = "${local.region}a" - instance_count = 2 - instance_match_criteria = "targeted" - - # Just for example - 30 minutes from time of 
creation - end_date = timeadd(timestamp(), "30m") - end_date_type = "limited" -} - -resource "aws_ec2_capacity_reservation" "small" { - instance_type = "t3.small" - instance_platform = "Linux/UNIX" - availability_zone = "${local.region}a" - instance_count = 2 - instance_match_criteria = "targeted" - - # Just for example - 30 minutes from time of creation - end_date = timeadd(timestamp(), "30m") - end_date_type = "limited" -} - -################################################################################ -# Kubectl Output -################################################################################ +resource "aws_resourcegroups_resource" "odcr" { + count = length(var.capacity_reservation_arns) -output "configure_kubectl" { - description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" + group_arn = aws_resourcegroups_group.odcr.arn + resource_arn = element(var.capacity_reservation_arns, count.index) } diff --git a/patterns/targeted-odcr/helm.tf b/patterns/targeted-odcr/helm.tf new file mode 100644 index 0000000000..765c26c465 --- /dev/null +++ b/patterns/targeted-odcr/helm.tf @@ -0,0 +1,47 @@ +################################################################################ +# Helm charts +################################################################################ + +resource "helm_release" "nvidia_device_plugin" { + name = "nvidia-device-plugin" + repository = "https://nvidia.github.io/k8s-device-plugin" + chart = "nvidia-device-plugin" + version = "0.14.5" + namespace = "nvidia-device-plugin" + create_namespace = true + wait = false + + values = [ + <<-EOT + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: 'nvidia.com/gpu.present' + operator: In + values: + - 'true' + EOT + ] +} + +resource 
"helm_release" "aws_efa_device_plugin" { + name = "aws-efa-k8s-device-plugin" + repository = "https://aws.github.io/eks-charts" + chart = "aws-efa-k8s-device-plugin" + version = "v0.4.4" + namespace = "kube-system" + wait = false + + values = [ + <<-EOT + nodeSelector: + vpc.amazonaws.com/efa.present: 'true' + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + EOT + ] +} diff --git a/patterns/targeted-odcr/main.tf b/patterns/targeted-odcr/main.tf index 7fbd064aa3..00c80fcd59 100644 --- a/patterns/targeted-odcr/main.tf +++ b/patterns/targeted-odcr/main.tf @@ -6,6 +6,10 @@ terraform { source = "hashicorp/aws" version = ">= 5.34" } + helm = { + source = "hashicorp/helm" + version = ">= 2.9" + } } # ## Used for end-to-end testing on project; update to suit your needs @@ -20,6 +24,20 @@ provider "aws" { region = local.region } +provider "helm" { + kubernetes { + host = module.eks.cluster_endpoint + cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data) + + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "aws" + # This requires the awscli to be installed locally where Terraform is executed + args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name] + } + } +} + ################################################################################ # Common data/locals ################################################################################ @@ -39,6 +57,15 @@ locals { } } +################################################################################ +# Output +################################################################################ + +output "configure_kubectl" { + description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" + value = "aws eks --region ${local.region} update-kubeconfig --name ${module.eks.cluster_name}" +} + 
################################################################################ # Supporting Resources ################################################################################