From 30b06debd0c7746348fc68e32e830e6984c0df5b Mon Sep 17 00:00:00 2001 From: Elamaran Shanmugam Date: Tue, 23 Jan 2024 23:15:24 -0500 Subject: [PATCH 1/7] Istio Monitoring Pattern --- cdk.json | 19 ++- .../amp-config/istio/alerting-rules.yml | 113 ++++++++++++++++++ .../amp-config/istio/recording-rules.yml | 59 +++++++++ .../index.ts | 35 ++++++ 4 files changed, 216 insertions(+), 10 deletions(-) create mode 100644 lib/common/resources/amp-config/istio/alerting-rules.yml create mode 100644 lib/common/resources/amp-config/istio/recording-rules.yml diff --git a/cdk.json b/cdk.json index dc36022b..e9672526 100644 --- a/cdk.json +++ b/cdk.json @@ -32,22 +32,21 @@ "GRAFANA_NSWRKLDS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/namespace-workloads.json", "GRAFANA_NODEEXP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodeexporter-nodes.json", "GRAFANA_NODES_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodes.json", - "GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json" + "GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json", + "GRAFANA_ISTIO_CP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-control-plane-dashboard.json", + "GRAFANA_ISTIO_MESH_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-mesh-dashboard.json", + "GRAFANA_ISTIO_PERF_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-performance-dashboard.json", + "GRAFANA_ISTIO_SERVICE_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-service-dashboard.json" }, "kustomizations": [ { "kustomizationPath": "./artifacts/grafana-operator-manifests/eks/infrastructure" + }, + { + "kustomizationPath": "./artifacts/grafana-operator-manifests/eks/istio" } ] }, - "gpuNodeGroup": { - "instanceType": "g4dn.xlarge", - "desiredSize": 2, - "minSize": 2, - "maxSize": 3, - "ebsSize": 50 - }, - "existing.cluster.name": "single-new-eks-observability-accelerator", - "existing.kubectl.rolename": "YOUR_KUBECTL_ROLE" + "istio.pattern.enabled": true } } \ No newline at end of file diff --git a/lib/common/resources/amp-config/istio/alerting-rules.yml b/lib/common/resources/amp-config/istio/alerting-rules.yml new file mode 100644 index 00000000..ef9f7fcd --- /dev/null +++ b/lib/common/resources/amp-config/istio/alerting-rules.yml @@ -0,0 +1,113 @@ + groups: + - name: "istio.basic.alerting-rules" + rules: + - alert: IngressTrafficMissing + annotations: + summary: 'ingress gateway traffic missing' + description: '[Critical]: ingress gateway traffic missing, likely other monitors are misleading, check client logs' + expr: > + absent(istio_requests_total{destination_service_namespace=~"service-graph.*",reporter="source",source_workload="istio-ingressgateway"})==1 + for: 5m + - alert: IstioMetricsMissing + annotations: + summary: 'Istio Metrics missing' + description: '[Critical]: Check prometheus deployment or whether the prometheus filters are applied correctly' + expr: > + absent(istio_request_total)==1 or absent(istio_request_duration_milliseconds_bucket)==1 + for: 5m + - name: "istio.workload.alerting-rules" + rules: + - alert: HTTP5xxRateHigh + annotations: + summary: '5xx rate too high' + description: 'The HTTP 5xx errors rate higher than 0.05 in 5 mins' + expr: > + sum(irate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(irate(istio_requests_total{reporter="destination"}[5m])) > 0.05 + for: 5m + - alert: WorkloadLatencyP99High + expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"svc.*"}[5m])) by (source_workload,namespace, le)) > 160 + for: 10m + annotations: + description: 'The workload request latency P99 > 160ms ' + message: "Request duration has slowed down for workload: {{`{{$labels.source_workload}}`}} in namespace: {{`{{$labels.namespace}}`}}. Response duration is {{`{{$value}}`}} milliseconds" + - alert: IngressLatencyP99High + expr: histogram_quantile(0.99, sum(irate(istio_request_duration_milliseconds_bucket{source_workload=~"istio.*"}[5m])) by (source_workload,namespace, le)) > 250 + for: 10m + annotations: + description: 'The ingress latency P99 > 250ms ' + message: "Request duration has slowed down for ingress: {{`{{$labels.source_workload}}`}} in namespace: {{`{{$labels.namespace}}`}}. Response duration is {{`{{$value}}`}} milliseconds" + - name: "istio.infra.alerting-rules" + rules: + - alert: ProxyContainerCPUUsageHigh + expr: (sum(rate(container_cpu_usage_seconds_total{namespace!="kube-system", container=~"istio-proxy", namespace!=""}[5m])) BY (namespace, pod, container) * 100) > 80 + for: 5m + annotations: + summary: "Proxy Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" + description: "Proxy Container CPU usage is above 80%" + - alert: ProxyContainerMemoryUsageHigh + expr: (sum(container_memory_working_set_bytes{namespace!="kube-system", container=~"istio-proxy", namespace!=""}) BY (container, pod, namespace) / (sum(container_spec_memory_limit_bytes{namespace!="kube-system", container!="POD"}) BY (container, pod, namespace) > 0)* 100) > 80 + for: 5m + annotations: + summary: "Proxy Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" + description: "Proxy Container Memory usage is above 80%" + - alert: IngressMemoryUsageIncreaseRateHigh + expr: avg(deriv(container_memory_working_set_bytes{container=~"istio-proxy",namespace="istio-system"}[60m])) > 200 + for: 180m + annotations: + summary: "Ingress proxy Memory change rate, VALUE = {{ $value }}\n" + description: "Ingress proxy Memory Usage increases more than 200 Bytes/sec" + - alert: IstiodContainerCPUUsageHigh + expr: (sum(rate(container_cpu_usage_seconds_total{namespace="istio-system", container="discovery"}[5m])) BY (pod) * 100) > 80 + for: 5m + annotations: + summary: "Istiod Container CPU usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" + description: "Isitod Container CPU usage is above 80%" + - alert: IstiodMemoryUsageHigh + expr: (sum(container_memory_working_set_bytes{namespace="istio-system", container="discovery"}) BY (pod) / (sum(container_spec_memory_limit_bytes{namespace="istio-system", container="discovery"}) BY (pod) > 0)* 100) > 80 + for: 5m + annotations: + summary: "Istiod Container Memory usage (namespace {{ $labels.namespace }}) (pod {{ $labels.pod }}) (container {{ $labels.container }}) VALUE = {{ $value }}\n" + description: "Istiod Container Memory usage is above 80%" + - alert: IstiodMemoryUsageIncreaseRateHigh + expr: sum(deriv(container_memory_working_set_bytes{namespace="istio-system",pod=~"istiod-.*"}[60m])) > 1000 + for: 300m + annotations: + summary: "Istiod Container Memory usage increase rate high, VALUE = {{ $value }}\n" + description: "Istiod Container Memory usage increases more than 1k Bytes/sec" + - name: "istio.controlplane.alerting-rules" + rules: + - alert: IstiodxdsPushErrorsHigh + annotations: + summary: 'istiod push errors is too high' + description: 'istiod push error rate is higher than 0.05' + expr: > + sum(irate(pilot_xds_push_errors{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05 + for: 5m + - alert: IstiodxdsRejectHigh + annotations: + summary: 'istiod rejects rate is too high' + description: 'istiod rejects rate is higher than 0.05' + expr: > + sum(irate(pilot_total_xds_rejects{app="istiod"}[5m])) / sum(irate(pilot_xds_pushes{app="istiod"}[5m])) > 0.05 + for: 5m + - alert: IstiodContainerNotReady + annotations: + summary: 'istiod container not ready' + description: 'container: discovery not running' + expr: > + kube_pod_container_status_running{namespace="istio-system", container="discovery", component=""} == 0 + for: 5m + - alert: IstiodUnavailableReplica + annotations: + summary: 'Istiod unavailable pod' + description: 'Istiod unavailable replica > 0' + expr: > + kube_deployment_status_replicas_unavailable{deployment="istiod", component=""} > 0 + for: 5m + - alert: Ingress200RateLow + annotations: + summary: 'ingress gateway 200 rate drops' + description: 'The expected rate is 100 per ns, the limit is set based on 15ns' + expr: > + sum(rate(istio_requests_total{reporter="source", source_workload="istio-ingressgateway",response_code="200",destination_service_namespace=~"service-graph.*"}[5m])) < 1490 + for: 30m \ No newline at end of file diff --git a/lib/common/resources/amp-config/istio/recording-rules.yml b/lib/common/resources/amp-config/istio/recording-rules.yml new file mode 100644 index 00000000..c2908934 --- /dev/null +++ b/lib/common/resources/amp-config/istio/recording-rules.yml @@ -0,0 +1,59 @@ + groups: + - name: "istio.recording-rules" + interval: 5s + rules: + - record: "workload:istio_requests_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_requests_total) + + - record: "workload:istio_request_duration_milliseconds_count" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_count) + + - record: "workload:istio_request_duration_milliseconds_sum" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_sum) + + - record: "workload:istio_request_duration_milliseconds_bucket" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_duration_milliseconds_bucket) + + - record: "workload:istio_request_bytes_count" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_count) + + - record: "workload:istio_request_bytes_sum" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_sum) + + - record: "workload:istio_request_bytes_bucket" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_request_bytes_bucket) + + - record: "workload:istio_response_bytes_count" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_count) + + - record: "workload:istio_response_bytes_sum" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_sum) + + - record: "workload:istio_response_bytes_bucket" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_response_bytes_bucket) + + - record: "workload:istio_tcp_sent_bytes_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_sent_bytes_total) + + - record: "workload:istio_tcp_received_bytes_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_received_bytes_total) + + - record: "workload:istio_tcp_connections_opened_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_opened_total) + + - record: "workload:istio_tcp_connections_closed_total" + expr: | + sum without(instance, kubernetes_namespace, kubernetes_pod_name) (istio_tcp_connections_closed_total) \ No newline at end of file diff --git a/lib/single-new-eks-opensource-observability-pattern/index.ts b/lib/single-new-eks-opensource-observability-pattern/index.ts index f03ccc51..0fac6d1d 100644 --- a/lib/single-new-eks-opensource-observability-pattern/index.ts +++ b/lib/single-new-eks-opensource-observability-pattern/index.ts @@ -2,6 +2,8 @@ import { Construct } from 'constructs'; import { utils } from '@aws-quickstart/eks-blueprints'; import * as blueprints from '@aws-quickstart/eks-blueprints'; import { GrafanaOperatorSecretAddon } from './grafanaoperatorsecretaddon'; +import * as eks from 'aws-cdk-lib/aws-eks'; +import * as ec2 from 'aws-cdk-lib/aws-ec2'; import * as amp from 'aws-cdk-lib/aws-aps'; import { ObservabilityBuilder } from '@aws-quickstart/eks-blueprints'; import * as fs from 'fs'; @@ -97,6 +99,20 @@ export default class SingleNewEksOpenSourceobservabilityPattern { ); } + if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) { + ampAddOnProps.openTelemetryCollector = { + manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml', + manifestParameterMap: { + javaScrapeSampleLimit: 1000, + javaPrometheusMetricsEndpoint: "/metrics" + } + }; + ampAddOnProps.ampRules?.ruleFilePaths.push( + __dirname + '/../common/resources/amp-config/istio/alerting-rules.yml', + __dirname + '/../common/resources/amp-config/istio/recording-rules.yml' + ); + } + Reflect.defineMetadata("ordered", true, blueprints.addons.GrafanaOperatorAddon); const addOns: Array = [ new blueprints.addons.CloudWatchLogsAddon({ @@ -108,9 +124,28 @@ export default class SingleNewEksOpenSourceobservabilityPattern { new GrafanaOperatorSecretAddon() ]; + if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) { + const istioControlPlaneAddOnProps = { + version: "1.18.2", + } + addOns.push(new blueprints.addons.IstioBaseAddOn({ + version: "1.18.2" + })); + addOns.push(new blueprints.addons.IstioControlPlaneAddOn(istioControlPlaneAddOnProps)); + } + + const mngProps: blueprints.MngClusterProviderProps = { + version: eks.KubernetesVersion.of("1.28"), + instanceTypes: [new ec2.InstanceType("m5.2xlarge")], + amiType: eks.NodegroupAmiType.AL2_X86_64, + desiredSize: 2, + maxSize: 3, + }; + ObservabilityBuilder.builder() .account(account) .region(region) + .clusterProvider(new blueprints.MngClusterProvider(mngProps)) .resourceProvider(ampWorkspaceName, new blueprints.CreateAmpProvider(ampWorkspaceName, ampWorkspaceName)) .version('auto') .withAmpProps(ampAddOnProps) From 3eda08bfb8acfb0657d4150b1d1ddc01c13e28af Mon Sep 17 00:00:00 2001 From: Elamaran Shanmugam Date: Tue, 23 Jan 2024 23:19:41 -0500 Subject: [PATCH 2/7] Istio Monitoring Pattern --- lib/single-new-eks-opensource-observability-pattern/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/single-new-eks-opensource-observability-pattern/index.ts b/lib/single-new-eks-opensource-observability-pattern/index.ts index 0fac6d1d..8e86a976 100644 --- a/lib/single-new-eks-opensource-observability-pattern/index.ts +++ b/lib/single-new-eks-opensource-observability-pattern/index.ts @@ -127,7 +127,7 @@ export default class SingleNewEksOpenSourceobservabilityPattern { if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) { const istioControlPlaneAddOnProps = { version: "1.18.2", - } + }; addOns.push(new blueprints.addons.IstioBaseAddOn({ version: "1.18.2" })); From 3a971a2004c594997a6969006c2c8dbccc80cb5c Mon Sep 17 00:00:00 2001 From: Elamaran Shanmugam Date: Tue, 23 Jan 2024 23:31:43 -0500 Subject: [PATCH 3/7] Istio Monitoring Pattern --- cdk.json | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cdk.json b/cdk.json index e9672526..c9940b54 100644 --- a/cdk.json +++ b/cdk.json @@ -47,6 +47,15 @@ } ] }, - "istio.pattern.enabled": true + "istio.pattern.enabled": true, + "gpuNodeGroup": { + "instanceType": "g4dn.xlarge", + "desiredSize": 2, + "minSize": 2, + "maxSize": 3, + "ebsSize": 50 + }, + "existing.cluster.name": "single-new-eks-observability-accelerator", + "existing.kubectl.rolename": "YOUR_KUBECTL_ROLE" } } \ No newline at end of file From 549d82a9e464f28ee65ffb14a68f97ca8c77c27a Mon Sep 17 00:00:00 2001 From: Elamaran Shanmugam Date: Wed, 24 Jan 2024 19:02:49 -0500 Subject: [PATCH 4/7] Istio Monitoring Pattern - Fixing Dash Issues --- .../resources/otel-collector-config.yml | 111 +++++++++++++++++- .../index.ts | 45 ++++--- .../istio/istioIngressGatewayAddon.ts | 26 ++++ .../istio/istiocniAddon.ts | 26 ++++ 4 files changed, 188 insertions(+), 20 deletions(-) create mode 100644 lib/single-new-eks-opensource-observability-pattern/istio/istioIngressGatewayAddon.ts create mode 100644 lib/single-new-eks-opensource-observability-pattern/istio/istiocniAddon.ts diff --git a/lib/common/resources/otel-collector-config.yml b/lib/common/resources/otel-collector-config.yml index 010dd42b..68652e9d 100644 --- a/lib/common/resources/otel-collector-config.yml +++ b/lib/common/resources/otel-collector-config.yml @@ -68,7 +68,7 @@ spec: regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/$${1}/proxy/metrics/cadvisor - {{ if enableAPIserverJob }} + {{ start enableAPIserverJob }} - job_name: 'apiserver' scheme: https tls_config: @@ -94,7 +94,7 @@ spec: regex: apiserver_request_duration_seconds_bucket;(0.15|0.2|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2|3|3.5|4|4.5|6|7|8|9|15|25|40|50) replacement: $1 action: drop - {{ end }} + {{ stop enableAPIserverJob }} - job_name: serviceMonitor/default/kube-prometheus-stack-prometheus-node-exporter/0 honor_timestamps: true scrape_interval: 30s @@ -1607,9 +1607,18 @@ spec: - job_name: 'node-exporter' kubernetes_sd_configs: - role: endpoints + ec2_sd_configs: + relabel_configs: + - source_labels: [ __address__ ] + action: keep + regex: '.*:9100$' + - action: replace + source_labels: [__meta_kubernetes_endpoint_node_name] + target_label: nodename + {{ start enableJavaMonJob }} - job_name: 'kubernetes-java-jmx' - sample_limit: {{javaScrapeSampleLimit}} - metrics_path: {{javaPrometheusMetricsEndpoint}} + sample_limit: {{ .Values.javaScrapeSampleLimit }} + metrics_path: {{ .Values.javaPrometheusMetricsEndpoint }} kubernetes_sd_configs: - role: pod relabel_configs: @@ -1637,6 +1646,100 @@ spec: - source_labels: [ __name__ ] regex: 'jvm_gc_collection_seconds.*' action: drop + {{ stop enableJavaMonJob }} + + {{ start enableNginxMonJob }} + - job_name: 'kubernetes-nginx' + sample_limit: {{ .Values.nginxScrapeSampleLimit }} + metrics_path: {{ .Values.nginxPrometheusMetricsEndpoint }} + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [ __address__ ] + action: keep + regex: '.*:10254$' + - source_labels: [__meta_kubernetes_pod_container_name] + target_label: container + action: replace + - source_labels: [__meta_kubernetes_pod_node_name] + target_label: host + action: replace + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + action: replace + metric_relabel_configs: + - source_labels: [__name__] + regex: 'go_memstats.*' + action: drop + - source_labels: [__name__] + regex: 'go_gc.*' + action: drop + - source_labels: [__name__] + regex: 'go_threads' + action: drop + - regex: exported_host + action: labeldrop + {{ stop enableNginxMonJob }} + + {{ start enableIstioMonJob }} + - honor_labels: true + job_name: kubernetes-istio + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: drop + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow + - action: replace + regex: (https?) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scheme + target_label: __scheme__ + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) + replacement: '[$$2]:$$1' + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: replace + regex: (\d+);((([0-9]+?)(\.|$)){4}) + replacement: $$2:$$1 + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_port + - __meta_kubernetes_pod_ip + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) + replacement: __param_$1 + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: replace + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - action: keep + source_labels: [ __address__ ] + regex: '.*:15020$$' + - action: drop + regex: Pending|Succeeded|Failed|Completed + source_labels: + - __meta_kubernetes_pod_phase + {{ stop enableIstioMonJob }} exporters: prometheusremotewrite: endpoint: "{{remoteWriteEndpoint}}" diff --git a/lib/single-new-eks-opensource-observability-pattern/index.ts b/lib/single-new-eks-opensource-observability-pattern/index.ts index 8e86a976..13ea8142 100644 --- a/lib/single-new-eks-opensource-observability-pattern/index.ts +++ b/lib/single-new-eks-opensource-observability-pattern/index.ts @@ -7,6 +7,8 @@ import * as ec2 from 'aws-cdk-lib/aws-ec2'; import * as amp from 'aws-cdk-lib/aws-aps'; import { ObservabilityBuilder } from '@aws-quickstart/eks-blueprints'; import * as fs from 'fs'; +import { IstioIngressGatewayHelmAddon } from './istio/istioIngressGatewayAddon'; +import { IstioCniHelmAddon } from './istio/istiocniAddon'; export default class SingleNewEksOpenSourceobservabilityPattern { constructor(scope: Construct, id: string) { @@ -46,8 +48,26 @@ export default class SingleNewEksOpenSourceobservabilityPattern { let doc = utils.readYamlDocument(__dirname + '/../common/resources/otel-collector-config.yml'); doc = utils.changeTextBetweenTokens( doc, - "{{ if enableAPIserverJob }}", - "{{ end }}", + "{{ start enableJavaMonJob }}", + "{{ stop enableJavaMonJob }}", + jsonStringnew.context["java.pattern.enabled"] + ); + doc = utils.changeTextBetweenTokens( + doc, + "{{ start enableNginxMonJob }}", + "{{ stop enableNginxMonJob }}", + jsonStringnew.context["nginx.pattern.enabled"] + ); + doc = utils.changeTextBetweenTokens( + doc, + "{{ start enableIstioMonJob }}", + "{{ stop enableIstioMonJob }}", + jsonStringnew.context["istio.pattern.enabled"] + ); + doc = utils.changeTextBetweenTokens( + doc, + "{{ start enableAPIserverJob }}", + "{{ stop enableAPIserverJob }}", jsonStringnew.context["apiserver.pattern.enabled"] ); doc = utils.changeTextBetweenTokens( @@ -88,11 +108,7 @@ export default class SingleNewEksOpenSourceobservabilityPattern { if (utils.valueFromContext(scope, "nginx.pattern.enabled", false)) { ampAddOnProps.openTelemetryCollector = { - manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml', - manifestParameterMap: { - javaScrapeSampleLimit: 1000, - javaPrometheusMetricsEndpoint: "/metrics" - } + manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml' }; ampAddOnProps.ampRules?.ruleFilePaths.push( __dirname + '/../common/resources/amp-config/nginx/alerting-rules.yml' @@ -101,11 +117,7 @@ export default class SingleNewEksOpenSourceobservabilityPattern { if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) { ampAddOnProps.openTelemetryCollector = { - manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml', - manifestParameterMap: { - javaScrapeSampleLimit: 1000, - javaPrometheusMetricsEndpoint: "/metrics" - } + manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml' }; ampAddOnProps.ampRules?.ruleFilePaths.push( __dirname + '/../common/resources/amp-config/istio/alerting-rules.yml', @@ -125,13 +137,14 @@ export default class SingleNewEksOpenSourceobservabilityPattern { ]; if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) { - const istioControlPlaneAddOnProps = { - version: "1.18.2", - }; addOns.push(new blueprints.addons.IstioBaseAddOn({ version: "1.18.2" })); - addOns.push(new blueprints.addons.IstioControlPlaneAddOn(istioControlPlaneAddOnProps)); + addOns.push(new blueprints.addons.IstioControlPlaneAddOn({ + version: "1.18.2" + })); + addOns.push(new IstioIngressGatewayHelmAddon); + addOns.push(new IstioCniHelmAddon); } const mngProps: blueprints.MngClusterProviderProps = { diff --git a/lib/single-new-eks-opensource-observability-pattern/istio/istioIngressGatewayAddon.ts b/lib/single-new-eks-opensource-observability-pattern/istio/istioIngressGatewayAddon.ts new file mode 100644 index 00000000..3387ccbd --- /dev/null +++ b/lib/single-new-eks-opensource-observability-pattern/istio/istioIngressGatewayAddon.ts @@ -0,0 +1,26 @@ +import 'source-map-support/register'; +import * as blueprints from '@aws-quickstart/eks-blueprints'; +import { Construct } from 'constructs'; +import { dependable } from '@aws-quickstart/eks-blueprints/dist/utils'; + +const defaultProps: blueprints.HelmAddOnProps = { + name: 'istio-ingressgateway', + release: 'ingressgateway', + namespace: 'istio-system', + chart: 'gateway', + version: '1.18.2', + repository: 'https://istio-release.storage.googleapis.com/charts', + values: {}, +}; + +export class IstioIngressGatewayHelmAddon extends blueprints.HelmAddOn { + + constructor() { + super({...defaultProps}); + } + @dependable(blueprints.addons.IstioBaseAddOn.name,blueprints.addons.IstioControlPlaneAddOn.name) + deploy(clusterInfo: blueprints.ClusterInfo): void | Promise { + const chart = this.addHelmChart(clusterInfo, this.props.values); + return Promise.resolve(chart); + } +} \ No newline at end of file diff --git a/lib/single-new-eks-opensource-observability-pattern/istio/istiocniAddon.ts b/lib/single-new-eks-opensource-observability-pattern/istio/istiocniAddon.ts new file mode 100644 index 00000000..30aae5b6 --- /dev/null +++ b/lib/single-new-eks-opensource-observability-pattern/istio/istiocniAddon.ts @@ -0,0 +1,26 @@ +import 'source-map-support/register'; +import * as blueprints from '@aws-quickstart/eks-blueprints'; +import { Construct } from 'constructs'; +import { dependable } from '@aws-quickstart/eks-blueprints/dist/utils'; + +const defaultProps: blueprints.HelmAddOnProps = { + name: 'istio-cni', + release: 'cni', + namespace: 'istio-system', + chart: 'cni', + version: '1.18.2', + repository: 'https://istio-release.storage.googleapis.com/charts', + values: {}, +}; + +export class IstioCniHelmAddon extends blueprints.HelmAddOn { + + constructor() { + super({...defaultProps}); + } + @dependable(blueprints.addons.IstioBaseAddOn.name,blueprints.addons.IstioControlPlaneAddOn.name) + deploy(clusterInfo: blueprints.ClusterInfo): void | Promise { + const chart = this.addHelmChart(clusterInfo, this.props.values); + return Promise.resolve(chart); + } +} \ No newline at end of file From f651acaaf3b1867fc9d18f490afd0e0fc9d2d48a Mon Sep 17 00:00:00 2001 From: Elamaran Shanmugam Date: Wed, 24 Jan 2024 19:20:21 -0500 Subject: [PATCH 5/7] Istio Monitoring Pattern - Fixing Dash Issues and CDK --- cdk.json | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cdk.json b/cdk.json index c9940b54..446adb81 100644 --- a/cdk.json +++ b/cdk.json @@ -33,21 +33,13 @@ "GRAFANA_NODEEXP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodeexporter-nodes.json", "GRAFANA_NODES_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodes.json", "GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json", - "GRAFANA_ISTIO_CP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-control-plane-dashboard.json", - "GRAFANA_ISTIO_MESH_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-mesh-dashboard.json", - "GRAFANA_ISTIO_PERF_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-performance-dashboard.json", - "GRAFANA_ISTIO_SERVICE_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/v0.2.0/artifacts/grafana-dashboards/eks/istio/istio-service-dashboard.json" }, "kustomizations": [ { "kustomizationPath": "./artifacts/grafana-operator-manifests/eks/infrastructure" - }, - { - "kustomizationPath": "./artifacts/grafana-operator-manifests/eks/istio" } ] }, - "istio.pattern.enabled": true, "gpuNodeGroup": { "instanceType": "g4dn.xlarge", "desiredSize": 2, From 1b81799c5f83aa138d6f71f8398cb17c01f18c1d Mon Sep 17 00:00:00 2001 From: Elamaran Shanmugam Date: Wed, 24 Jan 2024 19:20:55 -0500 Subject: [PATCH 6/7] Istio Monitoring Pattern - Fixing Dash Issues and CDK.json --- cdk.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdk.json b/cdk.json index 446adb81..dc36022b 100644 --- a/cdk.json +++ b/cdk.json @@ -32,7 +32,7 @@ "GRAFANA_NSWRKLDS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/namespace-workloads.json", "GRAFANA_NODEEXP_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodeexporter-nodes.json", "GRAFANA_NODES_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/nodes.json", - "GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json", + "GRAFANA_WORKLOADS_DASH_URL" : "https://raw.githubusercontent.com/aws-observability/aws-observability-accelerator/main/artifacts/grafana-dashboards/eks/infrastructure/workloads.json" }, "kustomizations": [ { From 213f48aff2a9173f1b8091c8fb818441239f2297 Mon Sep 17 00:00:00 2001 From: Elamaran Shanmugam Date: Thu, 25 Jan 2024 07:40:17 -0500 Subject: [PATCH 7/7] Fixing Graviton Pattern for Istio --- .../resources/otel-collector-config.yml | 8 +-- .../graviton-index.ts | 60 ++++++++++++++++++- .../index.ts | 6 +- 3 files changed, 67 insertions(+), 7 deletions(-) diff --git a/lib/common/resources/otel-collector-config.yml b/lib/common/resources/otel-collector-config.yml index 68652e9d..30eeb6c3 100644 --- a/lib/common/resources/otel-collector-config.yml +++ b/lib/common/resources/otel-collector-config.yml @@ -1617,8 +1617,8 @@ spec: target_label: nodename {{ start enableJavaMonJob }} - job_name: 'kubernetes-java-jmx' - sample_limit: {{ .Values.javaScrapeSampleLimit }} - metrics_path: {{ .Values.javaPrometheusMetricsEndpoint }} + sample_limit: {{javaScrapeSampleLimit}} + metrics_path: {{javaPrometheusMetricsEndpoint}} kubernetes_sd_configs: - role: pod relabel_configs: @@ -1650,8 +1650,8 @@ spec: {{ start enableNginxMonJob }} - job_name: 'kubernetes-nginx' - sample_limit: {{ .Values.nginxScrapeSampleLimit }} - metrics_path: {{ .Values.nginxPrometheusMetricsEndpoint }} + sample_limit: {{nginxScrapeSampleLimit}} + metrics_path: {{nginxPrometheusMetricsEndpoint}} kubernetes_sd_configs: - role: pod relabel_configs: diff --git a/lib/single-new-eks-opensource-observability-pattern/graviton-index.ts b/lib/single-new-eks-opensource-observability-pattern/graviton-index.ts index 8264d68e..ef89bc8e 100644 --- a/lib/single-new-eks-opensource-observability-pattern/graviton-index.ts +++ b/lib/single-new-eks-opensource-observability-pattern/graviton-index.ts @@ -7,6 +7,8 @@ import * as eks from 'aws-cdk-lib/aws-eks'; import * as ec2 from 'aws-cdk-lib/aws-ec2'; import { ObservabilityBuilder } from '@aws-quickstart/eks-blueprints'; import * as fs from 'fs'; +import { IstioIngressGatewayHelmAddon } from './istio/istioIngressGatewayAddon'; +import { IstioCniHelmAddon } from './istio/istiocniAddon'; export default class SingleNewEksGravitonOpenSourceObservabilityPattern { constructor(scope: Construct, id: string) { @@ -43,8 +45,26 @@ export default class SingleNewEksGravitonOpenSourceObservabilityPattern { let doc = utils.readYamlDocument(__dirname + '/../common/resources/otel-collector-config.yml'); doc = utils.changeTextBetweenTokens( doc, - "{{ if enableAPIserverJob }}", - "{{ end }}", + "{{ start enableJavaMonJob }}", + "{{ stop enableJavaMonJob }}", + jsonStringnew.context["java.pattern.enabled"] + ); + doc = utils.changeTextBetweenTokens( + doc, + "{{ start enableNginxMonJob }}", + "{{ stop enableNginxMonJob }}", + jsonStringnew.context["nginx.pattern.enabled"] + ); + doc = utils.changeTextBetweenTokens( + doc, + "{{ start enableIstioMonJob }}", + "{{ stop enableIstioMonJob }}", + jsonStringnew.context["istio.pattern.enabled"] + ); + doc = utils.changeTextBetweenTokens( + doc, + "{{ start enableAPIserverJob }}", + "{{ stop enableAPIserverJob }}", jsonStringnew.context["apiserver.pattern.enabled"] ); doc = utils.changeTextBetweenTokens( @@ -83,6 +103,29 @@ export default class SingleNewEksGravitonOpenSourceObservabilityPattern { ); } + if (utils.valueFromContext(scope, "nginx.pattern.enabled", false)) { + ampAddOnProps.openTelemetryCollector = { + manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml', + manifestParameterMap: { + nginxScrapeSampleLimit: 1000, + nginxPrometheusMetricsEndpoint: "/metrics" + } + }; + ampAddOnProps.ampRules?.ruleFilePaths.push( + __dirname + '/../common/resources/amp-config/nginx/alerting-rules.yml' + ); + } + + if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) { + ampAddOnProps.openTelemetryCollector = { + manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml' + }; + ampAddOnProps.ampRules?.ruleFilePaths.push( + __dirname + '/../common/resources/amp-config/istio/alerting-rules.yml', + __dirname + '/../common/resources/amp-config/istio/recording-rules.yml' + ); + } + Reflect.defineMetadata("ordered", true, blueprints.addons.GrafanaOperatorAddon); const addOns: Array = [ new blueprints.addons.CloudWatchLogsAddon({ @@ -94,10 +137,23 @@ export default class SingleNewEksGravitonOpenSourceObservabilityPattern { new GrafanaOperatorSecretAddon(), ]; + if (utils.valueFromContext(scope, "istio.pattern.enabled", false)) { + addOns.push(new blueprints.addons.IstioBaseAddOn({ + version: "1.18.2" + })); + addOns.push(new blueprints.addons.IstioControlPlaneAddOn({ + version: "1.18.2" + })); + addOns.push(new IstioIngressGatewayHelmAddon); + addOns.push(new IstioCniHelmAddon); + } + const mngProps: blueprints.MngClusterProviderProps = { version: eks.KubernetesVersion.of("1.27"), instanceTypes: [new ec2.InstanceType("m7g.large")], amiType: eks.NodegroupAmiType.AL2_ARM_64, + desiredSize: 2, + maxSize: 3, }; ObservabilityBuilder.builder() diff --git a/lib/single-new-eks-opensource-observability-pattern/index.ts b/lib/single-new-eks-opensource-observability-pattern/index.ts index 13ea8142..42032c1b 100644 --- a/lib/single-new-eks-opensource-observability-pattern/index.ts +++ b/lib/single-new-eks-opensource-observability-pattern/index.ts @@ -108,7 +108,11 @@ export default class SingleNewEksOpenSourceobservabilityPattern { if (utils.valueFromContext(scope, "nginx.pattern.enabled", false)) { ampAddOnProps.openTelemetryCollector = { - manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml' + manifestPath: __dirname + '/../common/resources/otel-collector-config-new.yml', + manifestParameterMap: { + nginxScrapeSampleLimit: 1000, + nginxPrometheusMetricsEndpoint: "/metrics" + } }; ampAddOnProps.ampRules?.ruleFilePaths.push( __dirname + '/../common/resources/amp-config/nginx/alerting-rules.yml'