From 1f91205f54652588d4d1a4f052a814212f95f6fc Mon Sep 17 00:00:00 2001 From: Ruchika Modi <106240341+ruchimo@users.noreply.github.com> Date: Fri, 19 Jan 2024 21:46:43 +0530 Subject: [PATCH 1/5] Update module to use the new container insights with CW for EKS (#252) * Adding Module and Example for ECS cluster monitoring with ecs_observer * Adding Module and Example for ECS cluster monitoring with ecs_observer * Incorporating PR comments * Restructuring Examples and modules folder for ECS, Added content in main Readme * Fixing path as per PR comments * Parameterzing the config files, incorporated PR review comments * Adding condition for AMP WS and fixing AMP endpoint * Adding Document for ECS Monitoring and parameterized some variables * Added sample dashboard * Adding Document for ECS Monitoring and parameterized some variables * Fixing failures detected by pre-commit * Fixing failures detected by pre-commit * Fixing failures detected by pre-commit * Pre-commit fixes * Fixing failures detected by pre-commit * Fixing failures detected by pre-commit * Pre-commit * Fixing HIGH security alerts detected by pre-commit * Fixing HIGH security alerts detected by pre-commit * Fixing HIGH security alerts detected by pre-commit, 31stOct * Add links after merge * 2ndNov - Added condiotnal creation for Grafana WS and module versions for AMG, AMP * Module to use the new container insights with CW for EKS * Module to use the new container insights with CW for EKS * Fixing precommit error * Updating EKS container insights to replace deamonset with tf resource * Updating EKS container insights to replace deamonset with tf resource * Updating EKS container insights to replace deamonset with tf resource * Updating EKS container insights to replace deamonset with tf resource * Updating EKS container insights- Adding doc and fixing pre-commit errors * Update Images in the doc * Update data.tf pre-commit error * Apply suggestions from code review * fixing broken hyperlink * EKS Container 
Insights - updating docs and few varibaleable names, definitions --- docs/container-insights/eks.md | 49 ++++++++++++-- examples/eks-container-insights/README.md | 16 ++--- examples/eks-container-insights/data.tf | 7 ++ examples/eks-container-insights/locals.tf | 22 +++++-- examples/eks-container-insights/main.tf | 38 ++--------- examples/eks-container-insights/variables.tf | 21 ++++-- examples/eks-container-insights/versions.tf | 20 ------ modules/eks-container-insights/README.md | 36 +++++------ modules/eks-container-insights/data.tf | 7 ++ modules/eks-container-insights/locals.tf | 67 +------------------- modules/eks-container-insights/main.tf | 63 +++++++++++------- modules/eks-container-insights/values.yaml | 20 ------ modules/eks-container-insights/variables.tf | 67 +++++++++----------- modules/eks-container-insights/versions.tf | 12 ---- 14 files changed, 190 insertions(+), 255 deletions(-) create mode 100644 examples/eks-container-insights/data.tf create mode 100644 modules/eks-container-insights/data.tf delete mode 100644 modules/eks-container-insights/values.yaml diff --git a/docs/container-insights/eks.md b/docs/container-insights/eks.md index 104b6739..05689361 100644 --- a/docs/container-insights/eks.md +++ b/docs/container-insights/eks.md @@ -1,8 +1,9 @@ # Setting Up Container Insights for your EKS Cluster -This example deploys AWS Distro of OpenTelemetry on your EKS cluster as a Daemonset which will enable -Container Insights metrics Dashboard on Amazon CloudWatch. +This example deploys the CloudWatch Observability EKS add-on on an existing Amazon EKS cluster, which enables Container Insights enhanced observability for Amazon EKS and CloudWatch Application Signals by default. +1. Enables the CloudWatch Observability Add-on on EKS using the IAM service account role +2. 
Creates an IAM Service Linked role for enabling Application Signals ## Prerequisites @@ -34,6 +35,16 @@ Specify the EKS Cluster Name where the resources will be deployed: export TF_VAR_eks_cluster_id=xxx ``` +### 3. Disable creation of `Cloudwatch Application Signals Service-linked Role` +If you already have Application Signals deployed in your AWS account, please set the value of this variable to `false` +``` +variable "create_cloudwatch_application_signals_role" { + type = bool + default = true + description = "Create a Cloudwatch Application Signals service-linked role" +} +``` + ## Deploy Simply run this command to deploy the example @@ -42,12 +53,40 @@ Simply run this command to deploy the example terraform apply ``` -## Visualization +## Enabling Application Signals (preview) for your services +CloudWatch Application Signals (preview) is currently supported for **Java** applications running on your Amazon EKS cluster. + +Next, you have to update your Application to `Configure application metrics and trace sampling`. For this, you must add an annotation to a manifest YAML in your cluster. Adding this annotation auto-instruments the application to send metrics, traces, and logs to Application Signals. You have two options for the annotation: + +1. **Annotate Workload** auto-instruments a single workload in the cluster. + - Paste the below line into the PodTemplate section of the workload manifest. + ``` + annotations: instrumentation.opentelemetry.io/inject-java: "true" + ``` + - In your terminal, enter `kubectl apply -f your_deployment_yaml` to apply the change. + +2. **Annotate Namespace** auto-instruments all workloads deployed in the selected namespace. + - Paste the below line into the metadata section of the namespace manifest. + ``` + annotations: instrumentation.opentelemetry.io/inject-java: "true" + ``` + - In your terminal, enter `kubectl apply -f your_namespace_yaml` to apply the change. 
+ - In your terminal, enter a command to restart all pods in the namespace. An example command to restart deployment workloads is `kubectl rollout restart deployment -n namespace_name` + +## Visualization of Container Insights data + +After `terraform apply` is successful, open your Amazon CloudWatch console in the same region as your EKS cluster, then from the left hand side choose `Insights -> Container Insights`, there choose the `EKS` from the drop down and you will see the metrics shown on the dashboard: + +image + + +## Visualization of CloudWatch Application Signals (preview) data -After apply, open your Amazon CloudWatch console in the same region as your EKS cluster, then from the left hand side choose `Insights -> Container Insights`, there choose the `Performance montoring` from the drop down, choose the `cluster name` and you will see the metrics shown on the dashboard: +After enabling your Application to pass metrics and traces by following [the steps provided above](#enabling-application-signals-preview-for-your-services), open your Amazon CloudWatch console in the same region as your EKS cluster, then from the left hand side choose `Application Signals -> Services` and you will see the metrics shown on the sample dashboard below: -![image](https://github.com/aws-observability/terraform-aws-observability-accelerator/assets/10175027/c5b9b685-5894-4350-b68a-ca86d1128f6f) +image +image ## Cleanup diff --git a/examples/eks-container-insights/README.md b/examples/eks-container-insights/README.md index 3d78f9c9..95e4be75 100644 --- a/examples/eks-container-insights/README.md +++ b/examples/eks-container-insights/README.md @@ -1,10 +1,8 @@ # Enable Container Insights for EKS cluster -This example deploys ADOT as a daemonset on your EKS cluster which enables Container Insights metrics on CloudWatch. 
- -Step-by-step instructions available on our [docs site](https://aws-observability.github.io/terraform-aws-observability-accelerator/) -under **Amazon CloudWatch Container Insights** +This example enables enhanced CloudWatch Container Insights for EKS and CloudWatch Application Signals (preview) through our CloudWatch EKS add-ons, providing comprehensive metrics, logs, and insights for cluster and application monitoring. +Step-by-step instructions available on our [docs site](https://aws-observability.github.io/terraform-aws-observability-accelerator/container-insights/eks/) ## Requirements @@ -13,9 +11,6 @@ under **Amazon CloudWatch Container Insights** |------|---------| | [terraform](#requirement\_terraform) | >= 1.1.0 | | [aws](#requirement\_aws) | >= 5.0.0 | -| [helm](#requirement\_helm) | >= 2.4.1 | -| [kubectl](#requirement\_kubectl) | >= 2.0.3 | -| [kubernetes](#requirement\_kubernetes) | >= 2.10 | ## Providers @@ -42,10 +37,11 @@ under **Amazon CloudWatch Container Insights** | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [aws\_region](#input\_aws\_region) | EKS cluster region | `string` | n/a | yes | +| [eks\_cluster\_domain](#input\_eks\_cluster\_domain) | The domain for the EKS cluster | `string` | `""` | no | | [eks\_cluster\_id](#input\_eks\_cluster\_id) | EKS cluster name | `string` | n/a | yes | -| [irsa\_iam\_permissions\_boundary](#input\_irsa\_iam\_permissions\_boundary) | IAM permissions boundary for IRSA roles | `string` | `null` | no | -| [irsa\_iam\_role\_path](#input\_irsa\_iam\_role\_path) | IAM role path for IRSA roles | `string` | `"/"` | no | +| [eks\_cluster\_version](#input\_eks\_cluster\_version) | The Kubernetes version for the cluster | `string` | `"1.28"` | no | +| [eks\_oidc\_provider](#input\_eks\_oidc\_provider) | The OpenID Connect identity provider (issuer URL without leading `https://`) | `string` | `null` | no | +| 
[eks\_oidc\_provider\_arn](#input\_eks\_oidc\_provider\_arn) | The OpenID Connect identity provider ARN | `string` | `null` | no | | [tags](#input\_tags) | Additional tags (e.g. `map('BusinessUnit`,`XYZ`) | `map(string)` | `{}` | no | ## Outputs diff --git a/examples/eks-container-insights/data.tf b/examples/eks-container-insights/data.tf new file mode 100644 index 00000000..89c9d09a --- /dev/null +++ b/examples/eks-container-insights/data.tf @@ -0,0 +1,7 @@ +data "aws_partition" "current" {} +data "aws_caller_identity" "current" {} +data "aws_region" "current" {} + +data "aws_eks_cluster" "eks_cluster" { + name = var.eks_cluster_id +} diff --git a/examples/eks-container-insights/locals.tf b/examples/eks-container-insights/locals.tf index f5007a9b..7d265d22 100644 --- a/examples/eks-container-insights/locals.tf +++ b/examples/eks-container-insights/locals.tf @@ -1,9 +1,19 @@ -data "aws_partition" "current" {} +locals { + name = "amazon-cloudwatch-observability" + eks_oidc_issuer_url = replace(data.aws_eks_cluster.eks_cluster.identity[0].oidc[0].issuer, "https://", "") -data "aws_caller_identity" "current" {} + addon_context = { + aws_caller_identity_account_id = data.aws_caller_identity.current.account_id + aws_caller_identity_arn = data.aws_caller_identity.current.arn + aws_partition_id = data.aws_partition.current.partition + aws_region_name = data.aws_region.current.name + eks_oidc_provider_arn = "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${local.eks_oidc_issuer_url}" + eks_cluster_id = data.aws_eks_cluster.eks_cluster.id + tags = var.tags + } -data "aws_region" "current" {} - -data "aws_eks_cluster" "eks_cluster" { - name = var.eks_cluster_id + addon_config = { + kubernetes_version = var.eks_cluster_version + most_recent = true + } } diff --git a/examples/eks-container-insights/main.tf b/examples/eks-container-insights/main.tf index fc05843c..e47777cd 100644 --- 
a/examples/eks-container-insights/main.tf +++ b/examples/eks-container-insights/main.tf @@ -1,34 +1,8 @@ -provider "aws" { - region = var.aws_region -} - -provider "kubernetes" { - host = local.eks_cluster_endpoint - cluster_ca_certificate = base64decode(data.aws_eks_cluster.eks_cluster.certificate_authority[0].data) - exec { - api_version = "client.authentication.k8s.io/v1beta1" - args = ["eks", "get-token", "--cluster-name", var.eks_cluster_id] - command = "aws" - } -} - -provider "helm" { - kubernetes { - host = local.eks_cluster_endpoint - cluster_ca_certificate = base64decode(data.aws_eks_cluster.eks_cluster.certificate_authority[0].data) - exec { - api_version = "client.authentication.k8s.io/v1beta1" - args = ["eks", "get-token", "--cluster-name", var.eks_cluster_id] - command = "aws" - } - } -} - - -# Deploy the ADOT Container Insights - module "eks_container_insights" { - source = "../../modules/eks-container-insights" - # source = "github.com/aws-observability/terraform-aws-observability-accelerator//modules/eks-container-insights?ref=v2.5.4" - eks_cluster_id = var.eks_cluster_id + source = "../../modules/eks-container-insights" + eks_cluster_id = var.eks_cluster_id + enable_amazon_eks_cw_observability = true + create_cloudwatch_observability_irsa_role = true + eks_oidc_provider_arn = local.addon_context.eks_oidc_provider_arn + create_cloudwatch_application_signals_role = true } diff --git a/examples/eks-container-insights/variables.tf b/examples/eks-container-insights/variables.tf index 36319c1a..2e9ff5cd 100644 --- a/examples/eks-container-insights/variables.tf +++ b/examples/eks-container-insights/variables.tf @@ -3,23 +3,30 @@ variable "eks_cluster_id" { type = string } -variable "aws_region" { - description = "EKS cluster region" +variable "eks_cluster_domain" { + description = "The domain for the EKS cluster" type = string + default = "" } -variable "irsa_iam_role_path" { - description = "IAM role path for IRSA roles" +variable "eks_oidc_provider" { 
+ description = "The OpenID Connect identity provider (issuer URL without leading `https://`)" type = string - default = "/" + default = null } -variable "irsa_iam_permissions_boundary" { - description = "IAM permissions boundary for IRSA roles" +variable "eks_oidc_provider_arn" { + description = "The OpenID Connect identity provider ARN" type = string default = null } +variable "eks_cluster_version" { + description = "The Kubernetes version for the cluster" + type = string + default = "1.28" +} + variable "tags" { description = "Additional tags (e.g. `map('BusinessUnit`,`XYZ`)" type = map(string) diff --git a/examples/eks-container-insights/versions.tf b/examples/eks-container-insights/versions.tf index a64bfa8e..e426124f 100644 --- a/examples/eks-container-insights/versions.tf +++ b/examples/eks-container-insights/versions.tf @@ -6,25 +6,5 @@ terraform { source = "hashicorp/aws" version = ">= 5.0.0" } - kubernetes = { - source = "hashicorp/kubernetes" - version = ">= 2.10" - } - kubectl = { - source = "alekc/kubectl" - version = ">= 2.0.3" - } - helm = { - source = "hashicorp/helm" - version = ">= 2.4.1" - } } - - # ## Used for end-to-end testing on project; update to suit your needs - # backend "s3" { - # bucket = "aws-observability-accelerator-terraform-states" - # region = "us-west-2" - # key = "e2e/eks_container_insights/terraform.tfstate" - # } - } diff --git a/modules/eks-container-insights/README.md b/modules/eks-container-insights/README.md index b6a67d8d..d0a900f2 100644 --- a/modules/eks-container-insights/README.md +++ b/modules/eks-container-insights/README.md @@ -1,10 +1,10 @@ -# Container Insights ADOT implementation for EKS Cluster Observability +# Container Insights CloudWatch implementation for EKS Cluster Observability -This module provides an automated experience around enabling Container Insights for your EKS cluster using ADOT (AWS Distro for OpenTelemetry). 
-It provides the following resources: +This module configures AWS CloudWatch Agent used for CloudWatch Application signals and Container Insights. -- ADOT Collector Deployment to your EKS cluster -- Enabling Container Insights on CloudWatch +Use CloudWatch Application Signals to automatically instrument your applications on AWS so that you can monitor current application health and track long-term application performance against your business objectives. Application Signals provides you with a unified, application-centric view of your applications, services, and dependencies, and helps you monitor and triage application health. + +Use CloudWatch Container Insights to collect, aggregate, and summarize metrics and logs from your containerized applications and microservices. CloudWatch automatically collects metrics for many resources, such as CPU, memory, disk, and network. Container Insights also provides diagnostic information, such as container restart failures, to help you isolate issues and resolve them quickly. You can also set CloudWatch alarms on metrics that Container Insights collects. 
@@ -14,9 +14,6 @@ It provides the following resources: |------|---------| | [terraform](#requirement\_terraform) | >= 1.1.0 | | [aws](#requirement\_aws) | >= 5.0.0 | -| [helm](#requirement\_helm) | >= 2.4.1 | -| [kubectl](#requirement\_kubectl) | >= 2.0.3 | -| [kubernetes](#requirement\_kubernetes) | >= 2.10 | ## Providers @@ -28,15 +25,17 @@ It provides the following resources: | Name | Source | Version | |------|--------|---------| -| [helm\_addon](#module\_helm\_addon) | github.com/aws-ia/terraform-aws-eks-blueprints//modules/kubernetes-addons/helm-addon | v4.32.1 | +| [cloudwatch\_observability\_irsa\_role](#module\_cloudwatch\_observability\_irsa\_role) | terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks | v5.33.0 | ## Resources | Name | Type | |------|------| +| [aws_eks_addon.amazon_cloudwatch_observability](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_addon) | resource | +| [aws_iam_service_linked_role.application_signals_cw](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/iam_service_linked_role) | resource | | [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | +| [aws_eks_addon_version.eks_addon_version](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_addon_version) | data source | | [aws_eks_cluster.eks_cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster) | data source | -| [aws_iam_policy.irsa](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy) | data source | | [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source | | [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | @@ -44,15 +43,14 @@ It provides the following 
resources: | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [adot\_otel\_helm\_chart\_verison](#input\_adot\_otel\_helm\_chart\_verison) | ADOT collector helm chart version | `string` | `"0.17.0"` | no | -| [eks\_cluster\_id](#input\_eks\_cluster\_id) | EKS Cluster Id | `string` | n/a | yes | -| [helm\_config](#input\_helm\_config) | Helm provider config for adot-exporter-for-eks-on-ec2 | `any` | `{}` | no | -| [irsa\_iam\_permissions\_boundary](#input\_irsa\_iam\_permissions\_boundary) | IAM permissions boundary for IRSA roles | `string` | `null` | no | -| [irsa\_iam\_role\_path](#input\_irsa\_iam\_role\_path) | IAM role path for IRSA roles | `string` | `"/"` | no | -| [irsa\_policies](#input\_irsa\_policies) | Additional IAM policies for a IAM role for service accounts | `list(string)` | `[]` | no | -| [manage\_via\_gitops](#input\_manage\_via\_gitops) | Determines if the add-on should be managed via GitOps. | `bool` | `false` | no | -| [service\_exporters](#input\_service\_exporters) | exporter for adot-ci setup | `string` | `"awsemf"` | no | -| [service\_receivers](#input\_service\_receivers) | receiver for adot-ci setup | `string` | `"awscontainerinsightreceiver"` | no | +| [addon\_config](#input\_addon\_config) | Amazon EKS Managed CloudWatch Observability Add-on config | `any` | `{}` | no | +| [create\_cloudwatch\_application\_signals\_role](#input\_create\_cloudwatch\_application\_signals\_role) | Create a Cloudwatch Application Signals service-linked role | `bool` | `true` | no | +| [create\_cloudwatch\_observability\_irsa\_role](#input\_create\_cloudwatch\_observability\_irsa\_role) | Create a Cloudwatch Observability IRSA | `bool` | `true` | no | +| [eks\_cluster\_id](#input\_eks\_cluster\_id) | Name of the EKS cluster | `string` | `"eks-cw"` | no | +| [eks\_oidc\_provider\_arn](#input\_eks\_oidc\_provider\_arn) | The OIDC Provider ARN of AWS EKS cluster | `string` | `""` | no | +| 
[enable\_amazon\_eks\_cw\_observability](#input\_enable\_amazon\_eks\_cw\_observability) | Enable Amazon EKS CloudWatch Observability add-on | `bool` | `true` | no | +| [kubernetes\_version](#input\_kubernetes\_version) | Kubernetes version | `string` | `"1.28"` | no | +| [most\_recent](#input\_most\_recent) | Determines if the most recent or default version of the addon should be returned. | `bool` | `false` | no | | [tags](#input\_tags) | Additional tags (e.g. `map('BusinessUnit`,`XYZ`) | `map(string)` | `{}` | no | ## Outputs diff --git a/modules/eks-container-insights/data.tf b/modules/eks-container-insights/data.tf new file mode 100644 index 00000000..89c9d09a --- /dev/null +++ b/modules/eks-container-insights/data.tf @@ -0,0 +1,7 @@ +data "aws_partition" "current" {} +data "aws_caller_identity" "current" {} +data "aws_region" "current" {} + +data "aws_eks_cluster" "eks_cluster" { + name = var.eks_cluster_id +} diff --git a/modules/eks-container-insights/locals.tf b/modules/eks-container-insights/locals.tf index 3246dbec..98e169c9 100644 --- a/modules/eks-container-insights/locals.tf +++ b/modules/eks-container-insights/locals.tf @@ -1,77 +1,14 @@ -data "aws_partition" "current" {} - -data "aws_caller_identity" "current" {} - -data "aws_region" "current" {} - -data "aws_eks_cluster" "eks_cluster" { - name = var.eks_cluster_id -} - -data "aws_iam_policy" "irsa" { - arn = "arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy" -} - locals { - name = "adot-exporter-for-eks-on-ec2" - service_account = try(var.helm_config.service_account, "${local.name}-sa") - - set_values = [ - { - name = "serviceAccount.name" - value = local.service_account - }, - { - name = "serviceAccount.create" - value = false - } - ] - # https://github.com/aws-observability/aws-otel-helm-charts/tree/main/charts/adot-exporter-for-eks-on-ec2 - default_helm_config = { - name = local.name - chart = "adot-exporter-for-eks-on-ec2" - repository = 
"https://aws-observability.github.io/aws-otel-helm-charts" - version = var.adot_otel_helm_chart_verison - namespace = "amazon-metrics" - values = local.default_helm_values - description = "ADOT Helm Chart Deployment Configuration for Container Insights" - } - - helm_config = merge( - local.default_helm_config, - var.helm_config - ) - - default_helm_values = [templatefile("${path.module}/values.yaml", { - aws_region = local.addon_context.aws_region_name - cluster_name = local.addon_context.eks_cluster_id - service_receivers = format("[\"%s\"]", var.service_receivers) - service_exporters = format("[\"%s\"]", var.service_exporters) - service_account = local.service_account - })] - - irsa_config = { - kubernetes_namespace = local.helm_config["namespace"] - kubernetes_service_account = local.service_account - create_kubernetes_namespace = try(local.helm_config["create_namespace"], true) - create_kubernetes_service_account = true - create_service_account_secret_token = try(local.helm_config["create_service_account_secret_token"], false) - irsa_iam_policies = concat([data.aws_iam_policy.irsa.arn], var.irsa_policies) - } - + kubernetes_version = var.kubernetes_version eks_oidc_issuer_url = replace(data.aws_eks_cluster.eks_cluster.identity[0].oidc[0].issuer, "https://", "") addon_context = { aws_caller_identity_account_id = data.aws_caller_identity.current.account_id aws_caller_identity_arn = data.aws_caller_identity.current.arn - aws_eks_cluster_endpoint = data.aws_eks_cluster.eks_cluster.endpoint aws_partition_id = data.aws_partition.current.partition aws_region_name = data.aws_region.current.name - eks_cluster_id = var.eks_cluster_id - eks_oidc_issuer_url = replace(data.aws_eks_cluster.eks_cluster.identity[0].oidc[0].issuer, "https://", "") eks_oidc_provider_arn = "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:oidc-provider/${local.eks_oidc_issuer_url}" + eks_cluster_id = data.aws_eks_cluster.eks_cluster.id tags = 
var.tags - irsa_iam_role_path = var.irsa_iam_role_path - irsa_iam_permissions_boundary = var.irsa_iam_permissions_boundary } } diff --git a/modules/eks-container-insights/main.tf b/modules/eks-container-insights/main.tf index d4c11538..e48c1cee 100644 --- a/modules/eks-container-insights/main.tf +++ b/modules/eks-container-insights/main.tf @@ -1,30 +1,47 @@ -provider "kubernetes" { - host = data.aws_eks_cluster.eks_cluster.endpoint - cluster_ca_certificate = base64decode(data.aws_eks_cluster.eks_cluster.certificate_authority[0].data) - exec { - api_version = "client.authentication.k8s.io/v1beta1" - args = ["eks", "get-token", "--cluster-name", local.addon_context.eks_cluster_id] - command = "aws" - } +locals { + name = "amazon-cloudwatch-observability" } -provider "helm" { - kubernetes { - host = data.aws_eks_cluster.eks_cluster.endpoint - cluster_ca_certificate = base64decode(data.aws_eks_cluster.eks_cluster.certificate_authority[0].data) - exec { - api_version = "client.authentication.k8s.io/v1beta1" - args = ["eks", "get-token", "--cluster-name", local.addon_context.eks_cluster_id] - command = "aws" +module "cloudwatch_observability_irsa_role" { + count = var.create_cloudwatch_observability_irsa_role ? 
1 : 0 + + source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" + version = "v5.33.0" + role_name = "cloudwatch-observability" + attach_cloudwatch_observability_policy = true + + oidc_providers = { + ex = { + provider_arn = var.eks_oidc_provider_arn + namespace_service_accounts = ["amazon-cloudwatch:cloudwatch-agent"] } } } -module "helm_addon" { - source = "github.com/aws-ia/terraform-aws-eks-blueprints//modules/kubernetes-addons/helm-addon?ref=v4.32.1" - manage_via_gitops = var.manage_via_gitops - set_values = local.set_values - helm_config = local.helm_config - irsa_config = local.irsa_config - addon_context = local.addon_context +data "aws_eks_addon_version" "eks_addon_version" { + addon_name = local.name + kubernetes_version = try(var.addon_config.kubernetes_version, var.kubernetes_version) + most_recent = try(var.addon_config.most_recent, true) +} + +resource "aws_eks_addon" "amazon_cloudwatch_observability" { + count = var.enable_amazon_eks_cw_observability ? 1 : 0 + + cluster_name = var.eks_cluster_id + addon_name = local.name + addon_version = try(var.addon_config.addon_version, data.aws_eks_addon_version.eks_addon_version.version) + resolve_conflicts_on_create = try(var.addon_config.resolve_conflicts_on_create, "OVERWRITE") + service_account_role_arn = try(module.cloudwatch_observability_irsa_role[0].iam_role_arn, null) + preserve = try(var.addon_config.preserve, true) + configuration_values = try(var.addon_config.configuration_values, null) + + tags = merge( + # var.addon_context.tags, + try(var.addon_config.tags, {}) + ) +} + +resource "aws_iam_service_linked_role" "application_signals_cw" { + count = var.create_cloudwatch_application_signals_role ? 
1 : 0 + aws_service_name = "application-signals.cloudwatch.amazonaws.com" } diff --git a/modules/eks-container-insights/values.yaml b/modules/eks-container-insights/values.yaml deleted file mode 100644 index d6a22ae9..00000000 --- a/modules/eks-container-insights/values.yaml +++ /dev/null @@ -1,20 +0,0 @@ - -awsRegion: ${aws_region} -clusterName: ${cluster_name} - - -adotCollector: - daemonSet: - serviceAccount: - create: false - name: ${service_account} - createNamespace: false - extensions: - sigv4auth: - region: ${aws_region} - service: - metrics: - receivers: ${service_receivers} - exporters: ${service_exporters} - sidecar: - regionS3: ${aws_region} diff --git a/modules/eks-container-insights/variables.tf b/modules/eks-container-insights/variables.tf index 039a7bd5..94c85b0c 100644 --- a/modules/eks-container-insights/variables.tf +++ b/modules/eks-container-insights/variables.tf @@ -1,42 +1,49 @@ -variable "helm_config" { - description = "Helm provider config for adot-exporter-for-eks-on-ec2" - type = any - default = {} +variable "eks_cluster_id" { + description = "Name of the EKS cluster" + default = "eks-cw" + type = string } -variable "manage_via_gitops" { +variable "enable_amazon_eks_cw_observability" { + description = "Enable Amazon EKS CloudWatch Observability add-on" type = bool - description = "Determines if the add-on should be managed via GitOps." 
- default = false + default = true } -variable "service_receivers" { - type = string - description = "receiver for adot-ci setup" - default = "awscontainerinsightreceiver" +variable "addon_config" { + description = "Amazon EKS Managed CloudWatch Observability Add-on config" + type = any + default = {} } -variable "service_exporters" { +variable "kubernetes_version" { + description = "Kubernetes version" type = string - description = "exporter for adot-ci setup" - default = "awsemf" + default = "1.28" } -variable "irsa_policies" { - description = "Additional IAM policies for a IAM role for service accounts" - type = list(string) - default = [] +variable "most_recent" { + description = "Determines if the most recent or default version of the addon should be returned." + type = bool + default = false } -variable "eks_cluster_id" { - description = "EKS Cluster Id" +variable "eks_oidc_provider_arn" { + description = "The OIDC Provider ARN of AWS EKS cluster" type = string + default = "" } -variable "adot_otel_helm_chart_verison" { - description = "ADOT collector helm chart version" - type = string - default = "0.17.0" +variable "create_cloudwatch_observability_irsa_role" { + type = bool + default = true + description = "Create a Cloudwatch Observability IRSA" +} + +variable "create_cloudwatch_application_signals_role" { + type = bool + default = true + description = "Create a Cloudwatch Application Signals service-linked role" } variable "tags" { @@ -44,15 +51,3 @@ variable "tags" { type = map(string) default = {} } - -variable "irsa_iam_role_path" { - description = "IAM role path for IRSA roles" - type = string - default = "/" -} - -variable "irsa_iam_permissions_boundary" { - description = "IAM permissions boundary for IRSA roles" - type = string - default = null -} diff --git a/modules/eks-container-insights/versions.tf b/modules/eks-container-insights/versions.tf index ea7a421f..e426124f 100644 --- a/modules/eks-container-insights/versions.tf +++ 
b/modules/eks-container-insights/versions.tf @@ -6,17 +6,5 @@ terraform { source = "hashicorp/aws" version = ">= 5.0.0" } - kubernetes = { - source = "hashicorp/kubernetes" - version = ">= 2.10" - } - kubectl = { - source = "alekc/kubectl" - version = ">= 2.0.3" - } - helm = { - source = "hashicorp/helm" - version = ">= 2.4.1" - } } } From 7697ca975a0e9b842106e1fbeb835d7931f9161e Mon Sep 17 00:00:00 2001 From: Rodrigue Koffi Date: Fri, 19 Jan 2024 20:37:58 +0100 Subject: [PATCH 2/5] chore: Drop base module (#255) * Drop base module * Update examples * Update outputs * Update multicluster example * Fix workspace input * Update module source * Update x account example * fixup! Update x account example * Fix pre-commit * Update documentation --- README.md | 190 ++---------------- docs/concepts.md | 74 +------ docs/contributors.md | 3 - docs/index.md | 19 +- .../data.tf | 5 + .../main.tf | 50 ++--- examples/eks-istio/README.md | 6 +- examples/eks-istio/main.tf | 38 +--- examples/eks-istio/outputs.tf | 8 +- examples/eks-multicluster/data.tf | 10 + examples/eks-multicluster/main.tf | 60 +++--- examples/existing-cluster-java/README.md | 4 +- examples/existing-cluster-java/main.tf | 35 +--- examples/existing-cluster-java/outputs.tf | 8 +- examples/existing-cluster-nginx/README.md | 4 +- examples/existing-cluster-nginx/main.tf | 34 +--- examples/existing-cluster-nginx/outputs.tf | 8 +- .../README.md | 8 +- .../main.tf | 40 +--- .../outputs.tf | 14 +- locals.tf | 17 -- main.tf | 20 -- modules/eks-monitoring/README.md | 6 + modules/eks-monitoring/alerts.tf | 2 +- modules/eks-monitoring/dashboards.tf | 4 +- modules/eks-monitoring/locals.tf | 11 +- modules/eks-monitoring/main.tf | 11 +- modules/eks-monitoring/outputs.tf | 15 ++ modules/eks-monitoring/rules.tf | 2 +- modules/eks-monitoring/variables.tf | 12 ++ outputs.tf | 24 --- variables.tf | 39 ---- versions.tf | 14 -- 33 files changed, 214 insertions(+), 581 deletions(-) delete mode 100644 locals.tf delete mode 100644 
main.tf delete mode 100644 outputs.tf delete mode 100644 variables.tf delete mode 100644 versions.tf diff --git a/README.md b/README.md index 78df8d24..c35e0081 100644 --- a/README.md +++ b/README.md @@ -11,140 +11,38 @@ AWS-managed observability services such as Amazon Managed Service for Prometheus Amazon Managed Grafana, AWS Distro for OpenTelemetry (ADOT) and Amazon CloudWatch. We provide curated metrics, logs, traces collection, alerting rules and Grafana -dashboards for your EKS infrastructure, Java/JMX, NGINX based workloads and -your custom applications. - -You also can monitor your Amazon Managed Service for Prometheus workspaces ingestion, -costs, active series with [this module](./modules/managed-prometheus-monitoring). +dashboards for your AWS infrastructure and custom applications. ![image](https://github.com/aws-observability/terraform-aws-observability-accelerator/assets/10175027/e83f8709-f754-4192-90f2-e3de96d2e26c) - ## Documentation To explore the complete project documentation, please visit our [documentation site.](https://aws-observability.github.io/terraform-aws-observability-accelerator/) +## ⚠️ Dropping base module -## ⚠️ Migration to v2.5 - -If you are migrating from earlier versions to v2.5, please follow this guide. - -v2.5.0 removes the dependency to the Terraform Grafana provider in the EKS -monitoring module. As Grafana Operator manages and syncs the Grafana contents, -Terraform is not required anymore in this context. - -However, if you migrate from earlier versions, you might leave some data orphans -as the Grafana provider is dropped. Terraform will throw an error. We have -released [v2.5.0-rc.1](https://github.com/aws-observability/terraform-aws-observability-accelerator/releases/tag/v2.5.0-rc.1) -which removes all the Grafana resources provisioned by Terraform in the EKS -context, without removing the provider configurations. 
- -- Step 1: migrate to [v2.5.0-rc.1](https://github.com/aws-observability/terraform-aws-observability-accelerator/releases/tag/v2.5.0-rc.1) -and run `apply` -- Step 2: migrate to `v2.5.0` or above - - -## Getting started - -To quick start with a complete workflow and view Amazon EKS infrastructure dashboards, -visit the [Amazon EKS cluster monitoring documentation](https://aws-observability.github.io/terraform-aws-observability-accelerator/eks/) - -## How it works - -The sections below demonstrate how you can leverage AWS Observability Accelerator -to enable monitoring to an existing EKS cluster. - - -### Base Module - -The base module allows you to configure the AWS Observability services for your -cluster and the AWS Distro for OpenTelemetry (ADOT) Operator as the signals -collection mechanism. - -This is the minimum configuration to have a new Amazon Managed Service for -Prometheus Workspace and ADOT Operator deployed for you and ready to receive -your data. The base module serve as an anchor to the workload modules and -cannot run on its own. - -```hcl -module "aws_observability_accelerator" { - # use release tags and check for the latest versions - # https://github.com/aws-observability/terraform-aws-observability-accelerator/releases - source = "github.com/aws-observability/terraform-aws-observability-accelerator?ref=v2.1.0" - - aws_region = "eu-west-1" - eks_cluster_id = "my-eks-cluster" - - # As Grafana shares a different lifecycle, we recommend using an existing workspace. 
- managed_grafana_workspace_id = var.managed_grafana_workspace_id -} -``` - -You can optionally reuse an existing Amazon Managed Servce for Prometheus Workspace: - -```hcl -module "aws_observability_accelerator" { - # use release tags and check for the latest versions - # https://github.com/aws-observability/terraform-aws-observability-accelerator/releases - source = "github.com/aws-observability/terraform-aws-observability-accelerator?ref=v2.1.0" +Starting in v2.12.0, we have removed the [base module](https://github.com/aws-observability/terraform-aws-observability-accelerator/blob/v2.10.3/docs/concepts.md#base-module) +which previously served as a glue between modules. However, the modules were quite +independent and the base module provided limited functionality. - aws_region = "eu-west-1" - eks_cluster_id = "my-eks-cluster" +We have moved the creation of Managed Prometheus Workspaces into the +[eks-monitoring module](https://github.com/aws-observability/terraform-aws-observability-accelerator/tree/main/modules/eks-monitoring). - # prevents creation of a new Amazon Managed Prometheus workspace - enable_managed_prometheus = false +See our [examples](https://github.com/aws-observability/terraform-aws-observability-accelerator/tree/main/examples) +to understand the changes. - # reusing existing Amazon Managed Prometheus Workspace - managed_prometheus_workspace_id = "ws-abcd123..." - - managed_grafana_workspace_id = "g-abcdef123" -} -``` - -View all the configuration options in the module documentation below. - -### Workload modules - -[Workloads modules](./modules) are provided, which essentially provide curated -metrics, logs, traces collection, alerting rules and Grafana dashboards. 
- -#### Amazon EKS monitoring - -```hcl -module "eks_monitoring" { - source = "github.com/aws-observability/terraform-aws-observability-accelerator//modules/eks-monitoring?ref=v2.1.0" - - eks_cluster_id = module.eks_observability_accelerator.eks_cluster_id - - dashboards_folder_id = module.eks_observability_accelerator.grafana_dashboards_folder_id - managed_prometheus_workspace_id = module.eks_observability_accelerator.managed_prometheus_workspace_id - - managed_prometheus_workspace_endpoint = module.eks_observability_accelerator.managed_prometheus_workspace_endpoint - managed_prometheus_workspace_region = module.eks_observability_accelerator.managed_prometheus_workspace_region - - enable_logs = true - enable_tracing = true -} -``` +## Modules -#### Amazon ECS monitoring -ECS cluster with VPC and EC2 can be created using the example [here](./examples/ecs_cluster_with_vpc) +This accelerator provides the following observability modules: -```hcl -module "ecs_monitoring" { - source = "github.com/aws-observability/terraform-aws-observability-accelerator//modules/ecs-monitoring" +- [EKS Monitoring with AWS-Managed Open Source](https://aws-observability.github.io/terraform-aws-observability-accelerator/eks/) - Get Prometheus metrics, CloudWatch logs collection, and X-Ray traces (with OTLP support) for your EKS cluster. Visualize key metrics and logs with provided Grafana dashboards and get pre-built alerting rules. - aws_ecs_cluster_name = module.ecs_cluster.cluster_name - task_role_arn = module.ecs_cluster.task_exec_iam_role_arn - execution_role_arn = module.ecs_cluster.task_exec_iam_role_arn -} -``` -Grafana Dashboards +- [EKS Monitoring with Enhanced CloudWatch Container Insights](https://aws-observability.github.io/terraform-aws-observability-accelerator/container-insights/eks/) - Get deep visibility into EKS using Amazon CloudWatch for metrics collection, aggregation, and insights summaries. 
Includes support for [CloudWatch Application Signals (preview)](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Application-Monitoring-Sections.html). -image +- [ECS Monitoring on EC2 with AWS-Managed Open Source](https://aws-observability.github.io/terraform-aws-observability-accelerator/ecs/ecs-monitoring-on-ec2/) - Collect metrics, traces, and logs for ECS on EC2 and send them to a Managed Prometheus workspace, X-Ray, and CloudWatch Logs. Includes pre-built Grafana dashboards for key metrics and logs. +- [Managed Prometheus Monitoring](https://aws-observability.github.io/terraform-aws-observability-accelerator/workloads/managed-prometheus/) - This module sets up automated observability for Amazon Managed Service for Prometheus workspaces, including a Grafana dashboard, CloudWatch monitoring, and service quota alarms. -Check the the [complete example](./examples/existing-cluster-with-base-and-infra/) ## Motivation @@ -156,9 +54,8 @@ Grafana and Amazon OpenSearch. AWS customers have asked for best-practices and guidance to collect metrics, logs and traces from their containerized applications and microservices with ease of deployment. Customers can use the AWS Observability Accelerator to configure their -metrics and traces collection, leveraging [AWS Distro for OpenTelemetry](https://aws-otel.github.io/), -to have opinionated dashboards and alerts available in only minutes. - +metrics and traces collection to have opinionated dashboards and alerts +available in only minutes. ## Support & Feedback @@ -173,59 +70,6 @@ section of this GitHub repo. If you are interested in contributing, see the [Contribution guide](https://github.com/aws-observability/terraform-aws-observability-accelerator/blob/main/CONTRIBUTING.md). 
---- - - -## Requirements - -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 1.1.0 | -| [aws](#requirement\_aws) | >= 4.0.0 | -| [awscc](#requirement\_awscc) | >= 0.24.0 | - -## Providers - -| Name | Version | -|------|---------| -| [aws](#provider\_aws) | >= 4.0.0 | - -## Modules - -No modules. - -## Resources - -| Name | Type | -|------|------| -| [aws_prometheus_alert_manager_definition.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/prometheus_alert_manager_definition) | resource | -| [aws_prometheus_workspace.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/prometheus_workspace) | resource | -| [aws_grafana_workspace.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/grafana_workspace) | data source | -| [aws_region.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/region) | data source | - -## Inputs - -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [aws\_region](#input\_aws\_region) | AWS Region | `string` | n/a | yes | -| [enable\_alertmanager](#input\_enable\_alertmanager) | Creates Amazon Managed Service for Prometheus AlertManager for all workloads | `bool` | `false` | no | -| [enable\_managed\_prometheus](#input\_enable\_managed\_prometheus) | Creates a new Amazon Managed Service for Prometheus Workspace | `bool` | `true` | no | -| [managed\_grafana\_workspace\_id](#input\_managed\_grafana\_workspace\_id) | Amazon Managed Grafana Workspace ID | `string` | n/a | yes | -| [managed\_prometheus\_workspace\_id](#input\_managed\_prometheus\_workspace\_id) | Amazon Managed Service for Prometheus Workspace ID | `string` | `""` | no | -| [managed\_prometheus\_workspace\_region](#input\_managed\_prometheus\_workspace\_region) | Region where Amazon Managed Service for Prometheus is deployed | `string` | `null` | no | -| 
[tags](#input\_tags) | Additional tags (e.g. `map('BusinessUnit`,`XYZ`) | `map(string)` | `{}` | no | - -## Outputs - -| Name | Description | -|------|-------------| -| [aws\_region](#output\_aws\_region) | AWS Region | -| [managed\_grafana\_workspace\_endpoint](#output\_managed\_grafana\_workspace\_endpoint) | Amazon Managed Grafana workspace endpoint | -| [managed\_prometheus\_workspace\_endpoint](#output\_managed\_prometheus\_workspace\_endpoint) | Amazon Managed Prometheus workspace endpoint | -| [managed\_prometheus\_workspace\_id](#output\_managed\_prometheus\_workspace\_id) | Amazon Managed Prometheus workspace ID | -| [managed\_prometheus\_workspace\_region](#output\_managed\_prometheus\_workspace\_region) | Amazon Managed Prometheus workspace region | - - ## Contributing See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. diff --git a/docs/concepts.md b/docs/concepts.md index 1a4c4641..9633c2a4 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -44,7 +44,6 @@ We have setup a [GitRepository](https://fluxcd.io/flux/components/source/gitrepo We have placed our declarative code snippet to create an Amazon Managed Service For Promethes datasource and Grafana Dashboard in Amazon Managed Grafana in our [AWS Observabiity Accelerator GitHub Repository](https://github.com/aws-observability/aws-observability-accelerator). We have setup a GitRepository to point to the AWS Observabiity Accelerator GitHub Repository and `Kustomization` for flux to sync Git Repository with artifacts in `./artifacts/grafana-operator-manifests/*` path in the AWS Observabiity Accelerator GitHub Repository. You can use this extension of our solution to point your own Kubernetes manifests to create Grafana Datasources and personified Grafana Dashboards of your choice using GitOps with Grafana Operator and Flux in Kubernetes native way with altering and redeploying this solution for changes to Grafana resources. 
- ## Release notes We encourage you to use our [release versions](https://github.com/aws-observability/terraform-aws-observability-accelerator/releases) @@ -57,82 +56,19 @@ module "eks_monitoring" { } ``` +## Modules -## Base module - -The base module allows you to configure the AWS Observability services for your cluster and -the AWS Distro for OpenTelemetry (ADOT) Operator as the signals collection mechanism. - -This is the minimum configuration to have a new Amazon Managed Service for Prometheus Workspace -and ADOT Operator deployed for you and ready to receive your data. -The base module serve as an anchor to the workload modules and cannot run on its own. - -```hcl -module "aws_observability_accelerator" { - # use release tags and check for the latest versions - # https://github.com/aws-observability/terraform-aws-observability-accelerator/releases - source = "github.com/aws-observability/terraform-aws-observability-accelerator?ref=v1.6.1" - - aws_region = "eu-west-1" - eks_cluster_id = "my-eks-cluster" - - # As Grafana shares a different lifecycle, we recommend using an existing workspace. - managed_grafana_workspace_id = var.managed_grafana_workspace_id -} -``` - -You can optionally reuse an existing Amazon Managed Service for Prometheus Workspace: - -```hcl -module "aws_observability_accelerator" { - # use release tags and check for the latest versions - # https://github.com/aws-observability/terraform-aws-observability-accelerator/releases - source = "github.com/aws-observability/terraform-aws-observability-accelerator?ref=v1.6.1" - - aws_region = "eu-west-1" - eks_cluster_id = "my-eks-cluster" - - # prevents creation of a new Amazon Managed Prometheus workspace - enable_managed_prometheus = false - - # reusing existing Amazon Managed Prometheus Workspace - managed_prometheus_workspace_id = "ws-abcd123..." 
- - managed_grafana_workspace_id = "g-abcdef123" -} -``` - -View all the configuration options in the [module's documentation](https://github.com/aws-observability/terraform-aws-observability-accelerator#requirements) - -## Workload modules - -Workloads modules are focused Terraform modules provided in this repository. They essentially provide curated metrics collection, alerts and Grafana dashboards according to the use case. Most of those modules require the base module. - -You can check the full workload modules list and their documentation [here](https://github.com/aws-observability/terraform-aws-observability-accelerator/tree/main/modules/). - +[Modules](https://github.com/aws-observability/terraform-aws-observability-accelerator/tree/main/modules/) +are sets of functionalities (e.g. Managed Open-Source EKS monitoring, CloudWatch Container Insights, ...) +packaged together that can be used to add Observability to your environments. All the modules come with end-to-end deployable examples. ## Examples -[Examples](https://github.com/aws-observability/terraform-aws-observability-accelerator/tree/main/examples) put modules together in a ready to deploy terraform configuration as a starting point. With little to no configuration, you can run `terraform apply` and use the deployed resources on your AWS Account. +[Examples](https://github.com/aws-observability/terraform-aws-observability-accelerator/tree/main/examples) put [modules](https://github.com/aws-observability/terraform-aws-observability-accelerator/tree/main/modules/) together in a ready to deploy terraform configuration as a starting point. With little to no configuration, you can run `terraform apply` and use the deployed resources on your AWS Account. 
You can find **workload** examples like [Amazon EKS infrastructure monitoring](https://aws-observability.github.io/terraform-aws-observability-accelerator/eks/) or [monitoring your Amazon Managed Service for Prometheus workspace](https://aws-observability.github.io/terraform-aws-observability-accelerator/workloads/managed-prometheus/) and more. -```mermaid -classDiagram - Example <|-- Base Module - Example <|-- Workload Module - class Base Module{ - Amazon Managed Prometheus - Amazon Managed Grafana Data Sources - } - class Workload Module{ - Amazon Distro for Open Telemetry Config - Amazon Managed Prometheus Alerts - Amazon Managed Grafana Dashboards - } -``` - ## Getting started with AWS Observability services diff --git a/docs/contributors.md b/docs/contributors.md index 8e742837..0c893128 100644 --- a/docs/contributors.md +++ b/docs/contributors.md @@ -14,11 +14,8 @@ The core team include the following people: * Jerome DECQ * Kevin Lewin * Michael Hausenblas -* Munish Dabra * Rodrigue Koffi * Toshal Dudhwala -* Vara Bonthu -* Vikram Venkataraman We welcome the wider open source community and thank [those who contribute](https://github.com/aws-observability/terraform-aws-observability-accelerator/graphs/contributors) to this project. diff --git a/docs/index.md b/docs/index.md index 4407a022..e8c44840 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,11 +8,7 @@ AWS-managed observability services such as Amazon Managed Service for Prometheus Amazon Managed Grafana, AWS Distro for OpenTelemetry (ADOT) and Amazon CloudWatch. We provide curated metrics, logs, traces collection, alerting rules and Grafana -dashboards for your EKS infrastructure, Java/JMX, NGINX based workloads and -your custom applications. - -You also can monitor your Amazon Managed Service for Prometheus workspaces ingestion, -costs, active series with [this module](https://aws-observability.github.io/terraform-aws-observability-accelerator/workloads/managed-prometheus/). 
+dashboards for your AWS infrastructure and custom applications. ![image](https://github.com/aws-observability/terraform-aws-observability-accelerator/assets/10175027/e83f8709-f754-4192-90f2-e3de96d2e26c) @@ -21,12 +17,13 @@ costs, active series with [this module](https://aws-observability.github.io/terr This project provides a set of Terraform modules to enable metrics, logs and traces collection, dashboards and alerts for monitoring: -- Amazon EKS clusters infrastructure and applications -- NGINX workloads (running on Amazon EKS) -- Java/JMX workloads (running on Amazon EKS) -- Amazon Managed Service for Prometheus workspaces with Amazon CloudWatch -- [Grafana Operator](https://github.com/grafana-operator/grafana-operator) and [Flux CD](https://fluxcd.io/) to manage Grafana contents (AWS data sources, Grafana Dashboards) with GitOps -- External Secrets Operator to retrieve and sync the Grafana API keys +- [EKS Monitoring with AWS-Managed Open Source](https://aws-observability.github.io/terraform-aws-observability-accelerator/eks/) - Get Prometheus metrics, CloudWatch logs collection, and X-Ray traces (with OTLP support) for your EKS cluster. Visualize key metrics and logs with provided Grafana dashboards and get pre-built alerting rules. + +- [EKS Monitoring with Enhanced CloudWatch Container Insights](https://aws-observability.github.io/terraform-aws-observability-accelerator/container-insights/eks/) - Get deep visibility into EKS using Amazon CloudWatch for metrics collection, aggregation, and insights summaries. Includes support for [CloudWatch Application Signals (preview)](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/CloudWatch-Application-Monitoring-Sections.html). 
+ +- [ECS Monitoring on EC2 with AWS-Managed Open Source](https://aws-observability.github.io/terraform-aws-observability-accelerator/ecs/ecs-monitoring-on-ec2/) - Collect metrics, traces, and logs for ECS on EC2 and send them to a Managed Prometheus workspace, X-Ray, and CloudWatch Logs. Includes pre-built Grafana dashboards for key metrics and logs. + +- [Managed Prometheus Monitoring](https://aws-observability.github.io/terraform-aws-observability-accelerator/workloads/managed-prometheus/) - This module sets up automated observability for Amazon Managed Service for Prometheus workspaces, including a Grafana dashboard, CloudWatch monitoring, and service quota alarms. These modules can be directly configured in your existing Terraform configurations or ready to be deployed in our packaged diff --git a/examples/eks-cross-account-with-central-amp/data.tf b/examples/eks-cross-account-with-central-amp/data.tf index e87543fa..88c91c4d 100644 --- a/examples/eks-cross-account-with-central-amp/data.tf +++ b/examples/eks-cross-account-with-central-amp/data.tf @@ -17,3 +17,8 @@ data "aws_eks_cluster" "eks_two" { name = var.cluster_two.name provider = aws.eks_cluster_two } + +data "aws_grafana_workspace" "this" { + workspace_id = var.monitoring.managed_grafana_id + provider = aws.eks_cluster_one +} diff --git a/examples/eks-cross-account-with-central-amp/main.tf b/examples/eks-cross-account-with-central-amp/main.tf index 9c608b45..d48f8cbc 100644 --- a/examples/eks-cross-account-with-central-amp/main.tf +++ b/examples/eks-cross-account-with-central-amp/main.tf @@ -6,7 +6,7 @@ locals { # EKS Monitoring Addon for cluster one # ########################################################################### module "eks_monitoring_one" { - source = "../../modules/eks-monitoring" + source = "../..//modules/eks-monitoring" # source = "github.com/aws-observability/terraform-aws-observability-accelerator//modules/eks-monitoring?ref=v2.0.0" providers = { aws = aws.eks_cluster_one @@ 
-36,11 +36,15 @@ module "eks_monitoring_one" { grafana_api_key = aws_grafana_workspace_api_key.key.key target_secret_name = "grafana-admin-credentials" target_secret_namespace = "grafana-operator" - grafana_url = module.aws_observability_accelerator.managed_grafana_workspace_endpoint + grafana_url = "https://${data.aws_grafana_workspace.this.endpoint}" - managed_prometheus_workspace_id = module.aws_observability_accelerator.managed_prometheus_workspace_id - managed_prometheus_workspace_endpoint = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint - managed_prometheus_workspace_region = module.aws_observability_accelerator.managed_prometheus_workspace_region + + # prevents the module from creating a workspace; the central one is reused + enable_managed_prometheus = false + + managed_prometheus_workspace_id = module.managed_service_prometheus.workspace_id + managed_prometheus_workspace_endpoint = module.managed_service_prometheus.workspace_prometheus_endpoint + managed_prometheus_workspace_region = var.monitoring.region managed_prometheus_cross_account_role = aws_iam_role.cross_account_amp_role.arn irsa_iam_additional_policies = [aws_iam_policy.irsa_assume_role_policy_one.arn] @@ -51,10 +55,6 @@ module "eks_monitoring_one" { } enable_logs = true - - depends_on = [ - module.aws_observability_accelerator - ] } @@ -63,7 +63,7 @@ module "eks_monitoring_one" { ########################################################################### module "eks_monitoring_two" { - source = "../../modules/eks-monitoring" + source = "../..//modules/eks-monitoring" # source = "github.com/aws-observability/terraform-aws-observability-accelerator//modules/eks-monitoring?ref=v2.0.0" providers = { aws = aws.eks_cluster_two @@ -91,11 +91,15 @@ module "eks_monitoring_two" { grafana_api_key = aws_grafana_workspace_api_key.key.key target_secret_name = "grafana-admin-credentials" target_secret_namespace = "grafana-operator" - grafana_url = 
module.aws_observability_accelerator.managed_grafana_workspace_endpoint + grafana_url = "https://${data.aws_grafana_workspace.this.endpoint}" + + # prevents the module from creating a workspace; the central one is reused + enable_managed_prometheus = false + + managed_prometheus_workspace_id = module.managed_service_prometheus.workspace_id + managed_prometheus_workspace_endpoint = module.managed_service_prometheus.workspace_prometheus_endpoint + managed_prometheus_workspace_region = var.monitoring.region - managed_prometheus_workspace_id = module.aws_observability_accelerator.managed_prometheus_workspace_id - managed_prometheus_workspace_endpoint = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint - managed_prometheus_workspace_region = module.aws_observability_accelerator.managed_prometheus_workspace_region managed_prometheus_cross_account_role = aws_iam_role.cross_account_amp_role.arn irsa_iam_additional_policies = [aws_iam_policy.irsa_assume_role_policy_two.arn] @@ -106,10 +110,6 @@ module "eks_monitoring_two" { } enable_logs = true - - depends_on = [ - module.aws_observability_accelerator - ] } ########################################################################### @@ -133,17 +133,3 @@ module "managed_service_prometheus" { workspace_alias = local.amp_workspace_alias } - -module "aws_observability_accelerator" { - source = "../../../terraform-aws-observability-accelerator" - aws_region = var.monitoring.region - enable_managed_prometheus = false - enable_alertmanager = false - managed_prometheus_workspace_region = var.monitoring.region - managed_prometheus_workspace_id = module.managed_service_prometheus.workspace_id - managed_grafana_workspace_id = var.monitoring.managed_grafana_id - - providers = { - aws = aws.central_monitoring - } -} diff --git a/examples/eks-istio/README.md b/examples/eks-istio/README.md index 873b4a0d..3a88dd41 100644 --- a/examples/eks-istio/README.md +++ b/examples/eks-istio/README.md @@ -1,4 +1,4 @@ -# Existing Cluster with the AWS 
Observability accelerator base module, Tetrate Istio Add-on and Istio monitoring +# Existing Cluster with Tetrate Istio Add-on and Istio monitoring View the full documentation for this example [here](https://aws-observability.github.io/terraform-aws-observability-accelerator/eks/istio) @@ -23,7 +23,6 @@ View the full documentation for this example [here](https://aws-observability.gi | Name | Source | Version | |------|--------|---------| -| [aws\_observability\_accelerator](#module\_aws\_observability\_accelerator) | ../../ | n/a | | [eks\_blueprints\_kubernetes\_addons](#module\_eks\_blueprints\_kubernetes\_addons) | github.com/aws-ia/terraform-aws-eks-blueprints//modules/kubernetes-addons | v4.32.1 | | [eks\_monitoring](#module\_eks\_monitoring) | ../../modules/eks-monitoring | n/a | @@ -33,6 +32,7 @@ View the full documentation for this example [here](https://aws-observability.gi |------|------| | [aws_eks_cluster.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster) | data source | | [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | +| [aws_grafana_workspace.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/grafana_workspace) | data source | ## Inputs @@ -49,9 +49,9 @@ View the full documentation for this example [here](https://aws-observability.gi | Name | Description | |------|-------------| -| [aws\_region](#output\_aws\_region) | AWS Region | | [eks\_cluster\_id](#output\_eks\_cluster\_id) | EKS Cluster Id | | [eks\_cluster\_version](#output\_eks\_cluster\_version) | EKS Cluster version | | [managed\_prometheus\_workspace\_endpoint](#output\_managed\_prometheus\_workspace\_endpoint) | Amazon Managed Prometheus workspace endpoint | | [managed\_prometheus\_workspace\_id](#output\_managed\_prometheus\_workspace\_id) | Amazon Managed Prometheus workspace ID | +| 
[managed\_prometheus\_workspace\_region](#output\_managed\_prometheus\_workspace\_region) | AWS Region | diff --git a/examples/eks-istio/main.tf b/examples/eks-istio/main.tf index 4d63ca9a..138887ce 100644 --- a/examples/eks-istio/main.tf +++ b/examples/eks-istio/main.tf @@ -10,6 +10,10 @@ data "aws_eks_cluster" "this" { name = var.eks_cluster_id } +data "aws_grafana_workspace" "this" { + workspace_id = var.managed_grafana_workspace_id +} + provider "kubernetes" { host = local.eks_cluster_endpoint cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data) @@ -33,28 +37,6 @@ locals { } } -# deploys the base module -module "aws_observability_accelerator" { - source = "../../" - # source = "github.com/aws-observability/terraform-aws-observability-accelerator?ref=v2.0.0" - - aws_region = var.aws_region - - # creates a new Amazon Managed Prometheus workspace, defaults to true - enable_managed_prometheus = local.create_new_workspace - - # reusing existing Amazon Managed Prometheus if specified - managed_prometheus_workspace_id = var.managed_prometheus_workspace_id - - # sets up the Amazon Managed Prometheus alert manager at the workspace level - enable_alertmanager = true - - # reusing existing Amazon Managed Grafana workspace - managed_grafana_workspace_id = var.managed_grafana_workspace_id - - tags = local.tags -} - module "eks_blueprints_kubernetes_addons" { source = "github.com/aws-ia/terraform-aws-eks-blueprints//modules/kubernetes-addons?ref=v4.32.1" @@ -95,15 +77,13 @@ module "eks_monitoring" { grafana_api_key = var.grafana_api_key target_secret_name = "grafana-admin-credentials" target_secret_namespace = "grafana-operator" - grafana_url = module.aws_observability_accelerator.managed_grafana_workspace_endpoint + grafana_url = "https://${data.aws_grafana_workspace.this.endpoint}" # control the publishing of dashboards by specifying the boolean value for the variable 'enable_dashboards', default is 'true' enable_dashboards = 
var.enable_dashboards - managed_prometheus_workspace_id = module.aws_observability_accelerator.managed_prometheus_workspace_id - - managed_prometheus_workspace_endpoint = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint - managed_prometheus_workspace_region = module.aws_observability_accelerator.managed_prometheus_workspace_region + enable_managed_prometheus = local.create_new_workspace + managed_prometheus_workspace_id = var.managed_prometheus_workspace_id # optional, defaults to 60s interval and 15s timeout prometheus_config = { @@ -114,8 +94,4 @@ module "eks_monitoring" { enable_logs = true tags = local.tags - - depends_on = [ - module.aws_observability_accelerator - ] } diff --git a/examples/eks-istio/outputs.tf b/examples/eks-istio/outputs.tf index ad1c3405..e14427e3 100644 --- a/examples/eks-istio/outputs.tf +++ b/examples/eks-istio/outputs.tf @@ -1,16 +1,16 @@ -output "aws_region" { +output "managed_prometheus_workspace_region" { description = "AWS Region" - value = module.aws_observability_accelerator.aws_region + value = module.eks_monitoring.managed_prometheus_workspace_region } output "managed_prometheus_workspace_endpoint" { description = "Amazon Managed Prometheus workspace endpoint" - value = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint + value = module.eks_monitoring.managed_prometheus_workspace_endpoint } output "managed_prometheus_workspace_id" { description = "Amazon Managed Prometheus workspace ID" - value = module.aws_observability_accelerator.managed_prometheus_workspace_id + value = module.eks_monitoring.managed_prometheus_workspace_id } output "eks_cluster_version" { diff --git a/examples/eks-multicluster/data.tf b/examples/eks-multicluster/data.tf index acc5d558..2a25fd5e 100644 --- a/examples/eks-multicluster/data.tf +++ b/examples/eks-multicluster/data.tf @@ -17,3 +17,13 @@ data "aws_eks_cluster" "eks_cluster_2" { name = var.eks_cluster_2_id provider = aws.eks_cluster_2 } + +data 
"aws_grafana_workspace" "this" { + workspace_id = var.managed_grafana_workspace_id + provider = aws.eks_cluster_1 +} + +data "aws_prometheus_workspace" "this" { + workspace_id = local.managed_prometheus_workspace_id + provider = aws.eks_cluster_1 +} diff --git a/examples/eks-multicluster/main.tf b/examples/eks-multicluster/main.tf index 8e3034b5..d5b8e72c 100644 --- a/examples/eks-multicluster/main.tf +++ b/examples/eks-multicluster/main.tf @@ -1,19 +1,9 @@ -module "aws_observability_accelerator" { - source = "../../../terraform-aws-observability-accelerator" - aws_region = var.eks_cluster_1_region - enable_managed_prometheus = false - enable_alertmanager = true - managed_prometheus_workspace_region = null - managed_prometheus_workspace_id = var.managed_prometheus_workspace_id - managed_grafana_workspace_id = var.managed_grafana_workspace_id - - providers = { - aws = aws.eks_cluster_1 - } +locals { + create_new_workspace = var.managed_prometheus_workspace_id == "" ? true : false + managed_prometheus_workspace_id = local.create_new_workspace ? 
module.managed_service_prometheus[0].workspace_id : var.managed_prometheus_workspace_id } - module "eks_cluster_1_monitoring" { - source = "../../../terraform-aws-observability-accelerator//modules/eks-monitoring" + source = "../..//modules/eks-monitoring" eks_cluster_id = var.eks_cluster_1_id enable_amazon_eks_adot = true enable_cert_manager = true @@ -31,11 +21,15 @@ module "eks_cluster_1_monitoring" { enable_apiserver_monitoring = true enable_adotcollector_metrics = true - grafana_api_key = var.grafana_api_key - managed_prometheus_workspace_id = module.aws_observability_accelerator.managed_prometheus_workspace_id - managed_prometheus_workspace_endpoint = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint - managed_prometheus_workspace_region = module.aws_observability_accelerator.managed_prometheus_workspace_region - grafana_url = module.aws_observability_accelerator.managed_grafana_workspace_endpoint + grafana_api_key = var.grafana_api_key + grafana_url = "https://${data.aws_grafana_workspace.this.endpoint}" + + # prevents the module to create a workspace + enable_managed_prometheus = false + + managed_prometheus_workspace_id = local.managed_prometheus_workspace_id + managed_prometheus_workspace_endpoint = data.aws_prometheus_workspace.this.prometheus_endpoint + managed_prometheus_workspace_region = var.eks_cluster_1_region prometheus_config = { global_scrape_interval = "60s" @@ -48,14 +42,10 @@ module "eks_cluster_1_monitoring" { kubernetes = kubernetes.eks_cluster_1 helm = helm.eks_cluster_1 } - - depends_on = [ - module.aws_observability_accelerator - ] } module "eks_cluster_2_monitoring" { - source = "../../../terraform-aws-observability-accelerator//modules/eks-monitoring" + source = "../..//modules/eks-monitoring" eks_cluster_id = var.eks_cluster_2_id enable_amazon_eks_adot = true enable_cert_manager = true @@ -73,9 +63,12 @@ module "eks_cluster_2_monitoring" { enable_apiserver_monitoring = false enable_adotcollector_metrics = 
false - managed_prometheus_workspace_id = module.aws_observability_accelerator.managed_prometheus_workspace_id - managed_prometheus_workspace_endpoint = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint - managed_prometheus_workspace_region = module.aws_observability_accelerator.managed_prometheus_workspace_region + # prevents the module from creating a workspace + enable_managed_prometheus = false + + managed_prometheus_workspace_id = local.managed_prometheus_workspace_id + managed_prometheus_workspace_endpoint = data.aws_prometheus_workspace.this.prometheus_endpoint + managed_prometheus_workspace_region = var.eks_cluster_1_region prometheus_config = { global_scrape_interval = "60s" @@ -88,8 +81,15 @@ module "eks_cluster_2_monitoring" { kubernetes = kubernetes.eks_cluster_2 helm = helm.eks_cluster_2 } +} + +module "managed_service_prometheus" { + count = local.create_new_workspace ? 1 : 0 + source = "terraform-aws-modules/managed-service-prometheus/aws" + version = "~> 2.2.2" + providers = { + aws = aws.eks_cluster_1 + } - depends_on = [ - module.aws_observability_accelerator - ] + workspace_alias = "aws-observability-accelerator-multicluster" } diff --git a/examples/existing-cluster-java/README.md b/examples/existing-cluster-java/README.md index 571fa68e..1b1e08b2 100644 --- a/examples/existing-cluster-java/README.md +++ b/examples/existing-cluster-java/README.md @@ -207,7 +207,6 @@ terraform destroy -var-file=terraform.tfvars | Name | Source | Version | |------|--------|---------| -| [aws\_observability\_accelerator](#module\_aws\_observability\_accelerator) | ../../ | n/a | | [eks\_monitoring](#module\_eks\_monitoring) | ../../modules/eks-monitoring | n/a | ## Resources @@ -216,6 +215,7 @@ terraform destroy -var-file=terraform.tfvars |------|------| | [aws_eks_cluster.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster) | data source | | 
[aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | +| [aws_grafana_workspace.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/grafana_workspace) | data source | ## Inputs @@ -232,9 +232,9 @@ terraform destroy -var-file=terraform.tfvars | Name | Description | |------|-------------| -| [aws\_region](#output\_aws\_region) | AWS Region | | [eks\_cluster\_id](#output\_eks\_cluster\_id) | EKS Cluster Id | | [eks\_cluster\_version](#output\_eks\_cluster\_version) | EKS Cluster version | | [managed\_prometheus\_workspace\_endpoint](#output\_managed\_prometheus\_workspace\_endpoint) | Amazon Managed Prometheus workspace endpoint | | [managed\_prometheus\_workspace\_id](#output\_managed\_prometheus\_workspace\_id) | Amazon Managed Prometheus workspace ID | +| [managed\_prometheus\_workspace\_region](#output\_managed\_prometheus\_workspace\_region) | AWS Region | diff --git a/examples/existing-cluster-java/main.tf b/examples/existing-cluster-java/main.tf index 302b4ced..7a3af6d7 100644 --- a/examples/existing-cluster-java/main.tf +++ b/examples/existing-cluster-java/main.tf @@ -10,6 +10,10 @@ data "aws_eks_cluster" "this" { name = var.eks_cluster_id } +data "aws_grafana_workspace" "this" { + workspace_id = var.managed_grafana_workspace_id +} + provider "kubernetes" { host = local.eks_cluster_endpoint cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data) @@ -33,25 +37,6 @@ locals { } } -# deploys the base module -module "aws_observability_accelerator" { - source = "../../" - # source = "github.com/aws-observability/terraform-aws-observability-accelerator?ref=v2.0.0" - - aws_region = var.aws_region - - # creates a new Amazon Managed Prometheus workspace, defaults to true - enable_managed_prometheus = local.create_new_workspace - - # reusing existing Amazon Managed Prometheus if specified - 
managed_prometheus_workspace_id = var.managed_prometheus_workspace_id - - # reusing existing Amazon Managed Grafana workspace - managed_grafana_workspace_id = var.managed_grafana_workspace_id - - tags = local.tags -} - module "eks_monitoring" { source = "../../modules/eks-monitoring" # source = "github.com/aws-observability/terraform-aws-observability-accelerator//modules/eks-monitoring?ref=v2.0.0" @@ -64,17 +49,15 @@ module "eks_monitoring" { grafana_api_key = var.grafana_api_key target_secret_name = "grafana-admin-credentials" target_secret_namespace = "grafana-operator" - grafana_url = module.aws_observability_accelerator.managed_grafana_workspace_endpoint + grafana_url = "https://${data.aws_grafana_workspace.this.endpoint}" eks_cluster_id = var.eks_cluster_id # control the publishing of dashboards by specifying the boolean value for the variable 'enable_dashboards', default is 'true' enable_dashboards = var.enable_dashboards - managed_prometheus_workspace_id = module.aws_observability_accelerator.managed_prometheus_workspace_id - - managed_prometheus_workspace_endpoint = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint - managed_prometheus_workspace_region = module.aws_observability_accelerator.managed_prometheus_workspace_region + enable_managed_prometheus = local.create_new_workspace + managed_prometheus_workspace_id = var.managed_prometheus_workspace_id # optional, defaults to 60s interval and 15s timeout prometheus_config = { @@ -86,8 +69,4 @@ module "eks_monitoring" { enable_logs = true tags = local.tags - - depends_on = [ - module.aws_observability_accelerator - ] } diff --git a/examples/existing-cluster-java/outputs.tf b/examples/existing-cluster-java/outputs.tf index ad1c3405..e14427e3 100644 --- a/examples/existing-cluster-java/outputs.tf +++ b/examples/existing-cluster-java/outputs.tf @@ -1,16 +1,16 @@ -output "aws_region" { +output "managed_prometheus_workspace_region" { description = "AWS Region" - value = 
module.aws_observability_accelerator.aws_region + value = module.eks_monitoring.managed_prometheus_workspace_region } output "managed_prometheus_workspace_endpoint" { description = "Amazon Managed Prometheus workspace endpoint" - value = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint + value = module.eks_monitoring.managed_prometheus_workspace_endpoint } output "managed_prometheus_workspace_id" { description = "Amazon Managed Prometheus workspace ID" - value = module.aws_observability_accelerator.managed_prometheus_workspace_id + value = module.eks_monitoring.managed_prometheus_workspace_id } output "eks_cluster_version" { diff --git a/examples/existing-cluster-nginx/README.md b/examples/existing-cluster-nginx/README.md index 066734c0..dad16832 100644 --- a/examples/existing-cluster-nginx/README.md +++ b/examples/existing-cluster-nginx/README.md @@ -218,7 +218,6 @@ add this `managed_prometheus_region=xxx` and `managed_prometheus_workspace_id=ws | Name | Source | Version | |------|--------|---------| -| [aws\_observability\_accelerator](#module\_aws\_observability\_accelerator) | ../../ | n/a | | [eks\_monitoring](#module\_eks\_monitoring) | ../../modules/eks-monitoring | n/a | ## Resources @@ -227,6 +226,7 @@ add this `managed_prometheus_region=xxx` and `managed_prometheus_workspace_id=ws |------|------| | [aws_eks_cluster.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster) | data source | | [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | +| [aws_grafana_workspace.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/grafana_workspace) | data source | ## Inputs @@ -243,9 +243,9 @@ add this `managed_prometheus_region=xxx` and `managed_prometheus_workspace_id=ws | Name | Description | |------|-------------| -| [aws\_region](#output\_aws\_region) | AWS Region | | 
[eks\_cluster\_id](#output\_eks\_cluster\_id) | EKS Cluster Id | | [eks\_cluster\_version](#output\_eks\_cluster\_version) | EKS Cluster version | | [managed\_prometheus\_workspace\_endpoint](#output\_managed\_prometheus\_workspace\_endpoint) | Amazon Managed Prometheus workspace endpoint | | [managed\_prometheus\_workspace\_id](#output\_managed\_prometheus\_workspace\_id) | Amazon Managed Prometheus workspace ID | +| [managed\_prometheus\_workspace\_region](#output\_managed\_prometheus\_workspace\_region) | AWS Region | diff --git a/examples/existing-cluster-nginx/main.tf b/examples/existing-cluster-nginx/main.tf index 987e18b1..659dbb21 100644 --- a/examples/existing-cluster-nginx/main.tf +++ b/examples/existing-cluster-nginx/main.tf @@ -10,6 +10,10 @@ data "aws_eks_cluster" "this" { name = var.eks_cluster_id } +data "aws_grafana_workspace" "this" { + workspace_id = var.managed_grafana_workspace_id +} + provider "kubernetes" { host = local.eks_cluster_endpoint cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data) @@ -34,24 +38,6 @@ locals { } } -module "aws_observability_accelerator" { - source = "../../" - # source = "github.com/aws-observability/terraform-aws-observability-accelerator?ref=v2.0.0" - - aws_region = var.aws_region - - # creates a new AMP workspace, defaults to true - enable_managed_prometheus = local.create_new_workspace - - # reusing existing AMP if specified - managed_prometheus_workspace_id = var.managed_prometheus_workspace_id - - # reusing existing Amazon Managed Grafana workspace - managed_grafana_workspace_id = var.managed_grafana_workspace_id - - tags = local.tags -} - module "eks_monitoring" { source = "../../modules/eks-monitoring" # source = "github.com/aws-observability/terraform-aws-observability-accelerator//modules/eks-monitoring?ref=v2.0.0" @@ -66,21 +52,15 @@ module "eks_monitoring" { grafana_api_key = var.grafana_api_key target_secret_name = "grafana-admin-credentials" 
target_secret_namespace = "grafana-operator" - grafana_url = module.aws_observability_accelerator.managed_grafana_workspace_endpoint + grafana_url = "https://${data.aws_grafana_workspace.this.endpoint}" # control the publishing of dashboards by specifying the boolean value for the variable 'enable_dashboards', default is 'true' enable_dashboards = var.enable_dashboards - managed_prometheus_workspace_id = module.aws_observability_accelerator.managed_prometheus_workspace_id - - managed_prometheus_workspace_endpoint = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint - managed_prometheus_workspace_region = module.aws_observability_accelerator.managed_prometheus_workspace_region + enable_managed_prometheus = local.create_new_workspace + managed_prometheus_workspace_id = var.managed_prometheus_workspace_id enable_logs = true tags = local.tags - - depends_on = [ - module.aws_observability_accelerator - ] } diff --git a/examples/existing-cluster-nginx/outputs.tf b/examples/existing-cluster-nginx/outputs.tf index ad1c3405..e14427e3 100644 --- a/examples/existing-cluster-nginx/outputs.tf +++ b/examples/existing-cluster-nginx/outputs.tf @@ -1,16 +1,16 @@ -output "aws_region" { +output "managed_prometheus_workspace_region" { description = "AWS Region" - value = module.aws_observability_accelerator.aws_region + value = module.eks_monitoring.managed_prometheus_workspace_region } output "managed_prometheus_workspace_endpoint" { description = "Amazon Managed Prometheus workspace endpoint" - value = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint + value = module.eks_monitoring.managed_prometheus_workspace_endpoint } output "managed_prometheus_workspace_id" { description = "Amazon Managed Prometheus workspace ID" - value = module.aws_observability_accelerator.managed_prometheus_workspace_id + value = module.eks_monitoring.managed_prometheus_workspace_id } output "eks_cluster_version" { diff --git 
a/examples/existing-cluster-with-base-and-infra/README.md b/examples/existing-cluster-with-base-and-infra/README.md index 3b06c870..a2a22630 100644 --- a/examples/existing-cluster-with-base-and-infra/README.md +++ b/examples/existing-cluster-with-base-and-infra/README.md @@ -1,4 +1,4 @@ -# Existing Cluster with the AWS Observability accelerator base module and Infrastructure monitoring +# Existing Cluster with the AWS Observability accelerator EKS Infrastructure monitoring This example demonstrates how to use the AWS Observability Accelerator Terraform modules with Infrastructure monitoring enabled. @@ -35,7 +35,6 @@ View the full documentation for this example [here](https://aws-observability.gi | Name | Source | Version | |------|--------|---------| -| [aws\_observability\_accelerator](#module\_aws\_observability\_accelerator) | ../../ | n/a | | [eks\_monitoring](#module\_eks\_monitoring) | ../../modules/eks-monitoring | n/a | ## Resources @@ -44,6 +43,7 @@ View the full documentation for this example [here](https://aws-observability.gi |------|------| | [aws_eks_cluster.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster) | data source | | [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source | +| [aws_grafana_workspace.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/grafana_workspace) | data source | ## Inputs @@ -60,7 +60,9 @@ View the full documentation for this example [here](https://aws-observability.gi | Name | Description | |------|-------------| -| [aws\_region](#output\_aws\_region) | AWS Region | | [eks\_cluster\_id](#output\_eks\_cluster\_id) | EKS Cluster Id | | [eks\_cluster\_version](#output\_eks\_cluster\_version) | EKS Cluster version | +| [managed\_prometheus\_workspace\_endpoint](#output\_managed\_prometheus\_workspace\_endpoint) | Amazon Managed Prometheus workspace endpoint | +| 
[managed\_prometheus\_workspace\_id](#output\_managed\_prometheus\_workspace\_id) | Amazon Managed Prometheus workspace ID | +| [managed\_prometheus\_workspace\_region](#output\_managed\_prometheus\_workspace\_region) | AWS Region | diff --git a/examples/existing-cluster-with-base-and-infra/main.tf b/examples/existing-cluster-with-base-and-infra/main.tf index eccd94a1..a00df02f 100644 --- a/examples/existing-cluster-with-base-and-infra/main.tf +++ b/examples/existing-cluster-with-base-and-infra/main.tf @@ -10,6 +10,10 @@ data "aws_eks_cluster" "this" { name = var.eks_cluster_id } +data "aws_grafana_workspace" "this" { + workspace_id = var.managed_grafana_workspace_id +} + provider "kubernetes" { host = local.eks_cluster_endpoint cluster_ca_certificate = base64decode(data.aws_eks_cluster.this.certificate_authority[0].data) @@ -33,28 +37,6 @@ locals { } } -# deploys the base module -module "aws_observability_accelerator" { - source = "../../" - # source = "github.com/aws-observability/terraform-aws-observability-accelerator?ref=v2.0.0" - - aws_region = var.aws_region - - # creates a new Amazon Managed Prometheus workspace, defaults to true - enable_managed_prometheus = local.create_new_workspace - - # reusing existing Amazon Managed Prometheus if specified - managed_prometheus_workspace_id = var.managed_prometheus_workspace_id - - # sets up the Amazon Managed Prometheus alert manager at the workspace level - enable_alertmanager = true - - # reusing existing Amazon Managed Grafana workspace - managed_grafana_workspace_id = var.managed_grafana_workspace_id - - tags = local.tags -} - module "eks_monitoring" { source = "../../modules/eks-monitoring" # source = "github.com/aws-observability/terraform-aws-observability-accelerator//modules/eks-monitoring?ref=v2.0.0" @@ -75,15 +57,17 @@ module "eks_monitoring" { grafana_api_key = var.grafana_api_key target_secret_name = "grafana-admin-credentials" target_secret_namespace = "grafana-operator" - grafana_url = 
module.aws_observability_accelerator.managed_grafana_workspace_endpoint + grafana_url = "https://${data.aws_grafana_workspace.this.endpoint}" # control the publishing of dashboards by specifying the boolean value for the variable 'enable_dashboards', default is 'true' enable_dashboards = var.enable_dashboards - managed_prometheus_workspace_id = module.aws_observability_accelerator.managed_prometheus_workspace_id + # creates a new Amazon Managed Prometheus workspace, defaults to true + enable_managed_prometheus = local.create_new_workspace + managed_prometheus_workspace_id = var.managed_prometheus_workspace_id - managed_prometheus_workspace_endpoint = module.aws_observability_accelerator.managed_prometheus_workspace_endpoint - managed_prometheus_workspace_region = module.aws_observability_accelerator.managed_prometheus_workspace_region + # sets up the Amazon Managed Prometheus alert manager at the workspace level + enable_alertmanager = true # optional, defaults to 60s interval and 15s timeout prometheus_config = { @@ -94,8 +78,4 @@ module "eks_monitoring" { enable_logs = true tags = local.tags - - depends_on = [ - module.aws_observability_accelerator - ] } diff --git a/examples/existing-cluster-with-base-and-infra/outputs.tf b/examples/existing-cluster-with-base-and-infra/outputs.tf index f8b4d584..e14427e3 100644 --- a/examples/existing-cluster-with-base-and-infra/outputs.tf +++ b/examples/existing-cluster-with-base-and-infra/outputs.tf @@ -1,6 +1,16 @@ -output "aws_region" { +output "managed_prometheus_workspace_region" { description = "AWS Region" - value = module.aws_observability_accelerator.aws_region + value = module.eks_monitoring.managed_prometheus_workspace_region +} + +output "managed_prometheus_workspace_endpoint" { + description = "Amazon Managed Prometheus workspace endpoint" + value = module.eks_monitoring.managed_prometheus_workspace_endpoint +} + +output "managed_prometheus_workspace_id" { + description = "Amazon Managed Prometheus workspace ID" + 
value = module.eks_monitoring.managed_prometheus_workspace_id } output "eks_cluster_version" { diff --git a/locals.tf b/locals.tf deleted file mode 100644 index 8e823659..00000000 --- a/locals.tf +++ /dev/null @@ -1,17 +0,0 @@ -data "aws_region" "current" {} - -data "aws_grafana_workspace" "this" { - workspace_id = var.managed_grafana_workspace_id -} - - -locals { - # if region is not passed, we assume the current one - amp_ws_region = coalesce(var.managed_prometheus_workspace_region, data.aws_region.current.name) - amp_ws_id = var.enable_managed_prometheus ? aws_prometheus_workspace.this[0].id : var.managed_prometheus_workspace_id - amp_ws_endpoint = "https://aps-workspaces.${local.amp_ws_region}.amazonaws.com/workspaces/${local.amp_ws_id}/" - - amg_ws_endpoint = "https://${data.aws_grafana_workspace.this.endpoint}" - - name = "aws-observability-accelerator" -} diff --git a/main.tf b/main.tf deleted file mode 100644 index d7e7ae62..00000000 --- a/main.tf +++ /dev/null @@ -1,20 +0,0 @@ -resource "aws_prometheus_workspace" "this" { - count = var.enable_managed_prometheus ? 1 : 0 - - alias = local.name - tags = var.tags -} - -resource "aws_prometheus_alert_manager_definition" "this" { - count = var.enable_alertmanager ? 
1 : 0 - - workspace_id = local.amp_ws_id - - definition = < [eks\_cluster\_id](#input\_eks\_cluster\_id) | EKS Cluster Id | `string` | n/a | yes | | [enable\_adotcollector\_metrics](#input\_enable\_adotcollector\_metrics) | Enables collection of ADOT collector metrics | `bool` | `true` | no | | [enable\_alerting\_rules](#input\_enable\_alerting\_rules) | Enables or disables Managed Prometheus alerting rules | `bool` | `true` | no | +| [enable\_alertmanager](#input\_enable\_alertmanager) | Creates Amazon Managed Service for Prometheus AlertManager for all workloads | `bool` | `false` | no | | [enable\_amazon\_eks\_adot](#input\_enable\_amazon\_eks\_adot) | Enables the ADOT Operator on the EKS Cluster | `bool` | `true` | no | | [enable\_apiserver\_monitoring](#input\_enable\_apiserver\_monitoring) | Enable EKS kube-apiserver monitoring, alerting and dashboards | `bool` | `true` | no | | [enable\_cert\_manager](#input\_enable\_cert\_manager) | Allow reusing an existing installation of cert-manager | `bool` | `true` | no | @@ -88,6 +90,7 @@ See examples using this Terraform modules in the **Amazon EKS** section of [this | [enable\_java](#input\_enable\_java) | Enable Java workloads monitoring, alerting and default dashboards | `bool` | `false` | no | | [enable\_kube\_state\_metrics](#input\_enable\_kube\_state\_metrics) | Enables or disables Kube State metrics exporter. 
Disabling this might affect some data in the dashboards | `bool` | `true` | no | | [enable\_logs](#input\_enable\_logs) | Using AWS For FluentBit to collect cluster and application logs to Amazon CloudWatch | `bool` | `true` | no | +| [enable\_managed\_prometheus](#input\_enable\_managed\_prometheus) | Creates a new Amazon Managed Service for Prometheus Workspace | `bool` | `true` | no | | [enable\_nginx](#input\_enable\_nginx) | Enable NGINX workloads monitoring, alerting and default dashboards | `bool` | `false` | no | | [enable\_node\_exporter](#input\_enable\_node\_exporter) | Enables or disables Node exporter. Disabling this might affect some data in the dashboards | `bool` | `true` | no | | [enable\_recording\_rules](#input\_enable\_recording\_rules) | Enables or disables Managed Prometheus recording rules | `bool` | `true` | no | @@ -137,4 +140,7 @@ See examples using this Terraform modules in the **Amazon EKS** section of [this | [adot\_irsa\_arn](#output\_adot\_irsa\_arn) | IRSA Arn for ADOT | | [eks\_cluster\_id](#output\_eks\_cluster\_id) | EKS Cluster Id | | [eks\_cluster\_version](#output\_eks\_cluster\_version) | EKS Cluster version | +| [managed\_prometheus\_workspace\_endpoint](#output\_managed\_prometheus\_workspace\_endpoint) | Amazon Managed Prometheus workspace endpoint | +| [managed\_prometheus\_workspace\_id](#output\_managed\_prometheus\_workspace\_id) | Amazon Managed Prometheus workspace ID | +| [managed\_prometheus\_workspace\_region](#output\_managed\_prometheus\_workspace\_region) | Amazon Managed Prometheus workspace region | diff --git a/modules/eks-monitoring/alerts.tf b/modules/eks-monitoring/alerts.tf index 1cae59b6..03740bdb 100644 --- a/modules/eks-monitoring/alerts.tf +++ b/modules/eks-monitoring/alerts.tf @@ -6,7 +6,7 @@ resource "aws_prometheus_rule_group_namespace" "alerting_rules" { count = var.enable_alerting_rules ? 
1 : 0 name = "accelerator-infra-alerting" - workspace_id = var.managed_prometheus_workspace_id + workspace_id = local.managed_prometheus_workspace_id data = < Date: Tue, 23 Jan 2024 00:44:50 +0100 Subject: [PATCH 3/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c35e0081..b0ba7c23 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ To explore the complete project documentation, please visit our [documentation s ## ⚠️ Dropping base module -Starting in v2.12.0, we have removed the [base module](https://github.com/aws-observability/terraform-aws-observability-accelerator/blob/v2.10.3/docs/concepts.md#base-module) +Starting in v2.11.0, we have removed the [base module](https://github.com/aws-observability/terraform-aws-observability-accelerator/blob/v2.10.3/docs/concepts.md#base-module) which previously served as a glue between modules. However, the modules were quite independent and the base module provided limited functionality. From d8b30678a3f862f16f848f76a5e3522f6cc4a8d3 Mon Sep 17 00:00:00 2001 From: Rodrigue Koffi Date: Tue, 23 Jan 2024 00:45:53 +0100 Subject: [PATCH 4/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b0ba7c23..c35e0081 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ To explore the complete project documentation, please visit our [documentation s ## ⚠️ Dropping base module -Starting in v2.11.0, we have removed the [base module](https://github.com/aws-observability/terraform-aws-observability-accelerator/blob/v2.10.3/docs/concepts.md#base-module) +Starting in v2.12.0, we have removed the [base module](https://github.com/aws-observability/terraform-aws-observability-accelerator/blob/v2.10.3/docs/concepts.md#base-module) which previously served as a glue between modules. However, the modules were quite independent and the base module provided limited functionality. 
From ada16d59655d3640abc699d77ba8a8131d6f0a43 Mon Sep 17 00:00:00 2001 From: Kevin Lewin <97046295+lewinkedrs@users.noreply.github.com> Date: Wed, 24 Jan 2024 12:09:52 -0500 Subject: [PATCH 5/5] feat(module: eks-monitoring) Add NVIDIA gpu monitoring dashboards (#257) * gpu dashboards * fixing locals * doc start * Update gpumon.md * fixing typos and doc names * fixing module name * fixing mkdocs * gpu to nvidia * Apply pre-commit --------- Co-authored-by: Rodrigue Koffi --- docs/eks/gpu-monitoring.md | 38 ++++++++++++++++++++++++++++ mkdocs.yml | 1 + modules/eks-monitoring/README.md | 3 +++ modules/eks-monitoring/dashboards.tf | 20 +++++++++++++++ modules/eks-monitoring/locals.tf | 9 +++++++ modules/eks-monitoring/main.tf | 4 +++ modules/eks-monitoring/variables.tf | 20 +++++++++++++++ 7 files changed, 95 insertions(+) create mode 100644 docs/eks/gpu-monitoring.md diff --git a/docs/eks/gpu-monitoring.md b/docs/eks/gpu-monitoring.md new file mode 100644 index 00000000..8514654d --- /dev/null +++ b/docs/eks/gpu-monitoring.md @@ -0,0 +1,38 @@ +# Monitoring NVIDIA GPU Workloads + +GPUs play an integral part in data intensive workloads. The eks-monitoring module of the Observability Accelerator provides the ability to deploy the NVIDIA DCGM Exporter Dashboard. +The dashboard utilizes metrics scraped from the `/metrics` endpoint that are exposed when running the nvidia gpu operator with the [DCGM exporter](https://developer.nvidia.com/blog/monitoring-gpus-in-kubernetes-with-dcgm/) and NVSMI binary. 
+ +!!!note + In order to make use of this dashboard, you will need to have a GPU-backed EKS cluster and deploy the [GPU operator](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/amazon-eks.html). + The recommended way of deploying the GPU operator is the [Data on EKS Blueprint](https://github.com/aws-ia/terraform-aws-eks-data-addons/blob/main/nvidia-gpu-operator.tf). + +## Deployment + +This is enabled by default in the [eks-monitoring module](https://aws-observability.github.io/terraform-aws-observability-accelerator/eks/). + +## Dashboards + +In order to start producing diagnostic metrics you must first deploy the nvidia SMI binary. nvidia-smi (also NVSMI) provides monitoring and management capabilities for each of NVIDIA’s devices from Fermi and higher architecture families. We can now deploy the nvidia-smi binary, which shows diagnostic information about all GPUs visible to the container: + +``` +cat << EOF | kubectl apply -f - +apiVersion: v1 +kind: Pod +metadata: + name: nvidia-smi +spec: + restartPolicy: OnFailure + containers: + - name: nvidia-smi + image: "nvidia/cuda:11.0.3-base-ubuntu20.04" + args: + - "nvidia-smi" + resources: + limits: + nvidia.com/gpu: 1 +EOF +``` +After producing the metrics they should populate the DCGM exporter dashboard: + +![image](https://github.com/aws-observability/terraform-aws-observability-accelerator/assets/97046295/66e8ae83-3a78-48b8-a9fc-4460a5a4d173) diff --git a/mkdocs.yml b/mkdocs.yml index 918978a0..bf8594b8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -28,6 +28,7 @@ nav: - Amazon EKS: - Infrastructure: eks/index.md - EKS API server: eks/eks-apiserver.md + - EKS GPU monitoring: eks/gpu-monitoring.md - Multicluster: - Single AWS account: eks/multicluster.md - Cross AWS account: eks/multiaccount.md diff --git a/modules/eks-monitoring/README.md b/modules/eks-monitoring/README.md index 5116546d..2899667e 100644 --- a/modules/eks-monitoring/README.md +++ b/modules/eks-monitoring/README.md @@ -61,6 +61,7 
@@ See examples using this Terraform modules in the **Amazon EKS** section of [this | [kubectl_manifest.flux_gitrepository](https://registry.terraform.io/providers/alekc/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.flux_kustomization](https://registry.terraform.io/providers/alekc/kubectl/latest/docs/resources/manifest) | resource | | [kubectl_manifest.kubeproxy_monitoring_dashboard](https://registry.terraform.io/providers/alekc/kubectl/latest/docs/resources/manifest) | resource | +| [kubectl_manifest.nvidia_monitoring_dashboards](https://registry.terraform.io/providers/alekc/kubectl/latest/docs/resources/manifest) | resource | | [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source | | [aws_eks_cluster.eks_cluster](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster) | data source | | [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source | @@ -93,6 +94,7 @@ See examples using this Terraform modules in the **Amazon EKS** section of [this | [enable\_managed\_prometheus](#input\_enable\_managed\_prometheus) | Creates a new Amazon Managed Service for Prometheus Workspace | `bool` | `true` | no | | [enable\_nginx](#input\_enable\_nginx) | Enable NGINX workloads monitoring, alerting and default dashboards | `bool` | `false` | no | | [enable\_node\_exporter](#input\_enable\_node\_exporter) | Enables or disables Node exporter. 
Disabling this might affect some data in the dashboards | `bool` | `true` | no | +| [enable\_nvidia\_monitoring](#input\_enable\_nvidia\_monitoring) | Enables monitoring of nvidia metrics | `bool` | `true` | no | | [enable\_recording\_rules](#input\_enable\_recording\_rules) | Enables or disables Managed Prometheus recording rules | `bool` | `true` | no | | [enable\_tracing](#input\_enable\_tracing) | Enables tracing with OTLP traces receiver to X-Ray | `bool` | `true` | no | | [flux\_config](#input\_flux\_config) | FluxCD configuration |
object({
create_namespace = bool
k8s_namespace = string
helm_chart_name = string
helm_chart_version = string
helm_release_name = string
helm_repo_url = string
helm_settings = map(string)
helm_values = map(any)
})
|
{
"create_namespace": true,
"helm_chart_name": "flux2",
"helm_chart_version": "2.12.2",
"helm_release_name": "observability-fluxcd-addon",
"helm_repo_url": "https://fluxcd-community.github.io/helm-charts",
"helm_settings": {},
"helm_values": {},
"k8s_namespace": "flux-system"
}
| no | @@ -127,6 +129,7 @@ See examples using this Terraform modules in the **Amazon EKS** section of [this | [managed\_prometheus\_workspace\_region](#input\_managed\_prometheus\_workspace\_region) | Amazon Managed Prometheus Workspace's Region | `string` | `null` | no | | [ne\_config](#input\_ne\_config) | Node exporter configuration |
object({
create_namespace = bool
k8s_namespace = string
helm_chart_name = string
helm_chart_version = string
helm_release_name = string
helm_repo_url = string
helm_settings = map(string)
helm_values = map(any)

scrape_interval = string
scrape_timeout = string
})
|
{
"create_namespace": true,
"helm_chart_name": "prometheus-node-exporter",
"helm_chart_version": "4.24.0",
"helm_release_name": "prometheus-node-exporter",
"helm_repo_url": "https://prometheus-community.github.io/helm-charts",
"helm_settings": {},
"helm_values": {},
"k8s_namespace": "prometheus-node-exporter",
"scrape_interval": "60s",
"scrape_timeout": "60s"
}
| no | | [nginx\_config](#input\_nginx\_config) | Configuration object for NGINX monitoring |
object({
enable_alerting_rules = bool
enable_recording_rules = bool
enable_dashboards = bool
scrape_sample_limit = number

flux_gitrepository_name = string
flux_gitrepository_url = string
flux_gitrepository_branch = string
flux_kustomization_name = string
flux_kustomization_path = string

grafana_dashboard_url = string

prometheus_metrics_endpoint = string
})
| `null` | no | +| [nvidia\_monitoring\_config](#input\_nvidia\_monitoring\_config) | Config object for nvidia monitoring |
object({
flux_gitrepository_name = string
flux_gitrepository_url = string
flux_gitrepository_branch = string
flux_kustomization_name = string
flux_kustomization_path = string
})
| `null` | no | | [prometheus\_config](#input\_prometheus\_config) | Controls default values such as scrape interval, timeouts and ports globally |
object({
global_scrape_interval = string
global_scrape_timeout = string
})
|
{
"global_scrape_interval": "120s",
"global_scrape_timeout": "15s"
}
| no | | [tags](#input\_tags) | Additional tags (e.g. `map('BusinessUnit`,`XYZ`) | `map(string)` | `{}` | no | | [target\_secret\_name](#input\_target\_secret\_name) | Target secret in Kubernetes to store the Grafana API Key Secret | `string` | `"grafana-admin-credentials"` | no | diff --git a/modules/eks-monitoring/dashboards.tf b/modules/eks-monitoring/dashboards.tf index 6ad13f33..d130e80d 100644 --- a/modules/eks-monitoring/dashboards.tf +++ b/modules/eks-monitoring/dashboards.tf @@ -95,6 +95,26 @@ YAML depends_on = [module.external_secrets] } +# nvidia dashboards +resource "kubectl_manifest" "nvidia_monitoring_dashboards" { + yaml_body = <