From bd492486142ddd6a2989071f36b0c1e3c73dfcbd Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Simillon Date: Wed, 15 Nov 2023 13:52:57 +0100 Subject: [PATCH] feat : convert module integration_aws-elasticsearch to gen format feat : convert module integration_aws-elasticache-common to gen format feat : support for multiple signals in rules feat : support for using the same severity multiple times in rules --- .../README.md | 10 +- .../common-filters.tf | 5 +- .../common-locals.tf | 45 +- .../common-modules.tf | 9 +- .../common-variables.tf | 79 ++- .../common-versions.tf | 10 +- .../conf/00-heartbeat.yaml | 12 + .../conf/01-evictions.yaml | 21 + .../conf/02-max-connection.yaml | 17 + .../conf/03-current-connection.yaml | 17 + .../conf/04-swap.yaml | 21 + .../conf/05-free-memory.yaml | 21 + .../conf/06-evictions-growing.yaml | 21 + ...ectors-elasticache.tf => detectors-gen.tf} | 135 ++--- .../{variables.tf => variables-gen.tf} | 352 +++++++----- .../integration_aws-elasticsearch/README.md | 15 +- .../common-filters.tf | 5 +- .../common-locals.tf | 45 +- .../common-modules.tf | 9 +- .../common-variables.tf | 79 ++- .../common-versions.tf | 10 +- .../conf/00-heartbeat.yaml | 13 + .../conf/03-5xx.yaml | 39 +- .../conf/05-cluster-status.yaml | 25 + .../conf/06-free-space.yaml | 21 + .../conf/07-ultrawarm-free-space.yaml | 21 + .../conf/08-cluster-cpu.yaml | 24 + .../conf/09-master-cpu.yaml | 19 + .../detectors-elasticsearch.tf | 240 -------- .../detectors-gen.tf | 303 +++++++++- .../integration_aws-elasticsearch/moved.tf | 9 + .../integration_aws-elasticsearch/outputs.tf | 16 +- .../variables-gen.tf | 540 +++++++++++++++++- .../variables.tf | 373 ------------ scripts/templates/detector.tf.j2 | 6 +- 35 files changed, 1716 insertions(+), 871 deletions(-) mode change 120000 => 100644 modules/integration_aws-elasticache-common/common-filters.tf mode change 120000 => 100644 modules/integration_aws-elasticache-common/common-locals.tf mode change 120000 => 100644 
modules/integration_aws-elasticache-common/common-modules.tf mode change 120000 => 100644 modules/integration_aws-elasticache-common/common-variables.tf mode change 120000 => 100644 modules/integration_aws-elasticache-common/common-versions.tf create mode 100644 modules/integration_aws-elasticache-common/conf/00-heartbeat.yaml create mode 100644 modules/integration_aws-elasticache-common/conf/01-evictions.yaml create mode 100644 modules/integration_aws-elasticache-common/conf/02-max-connection.yaml create mode 100644 modules/integration_aws-elasticache-common/conf/03-current-connection.yaml create mode 100644 modules/integration_aws-elasticache-common/conf/04-swap.yaml create mode 100644 modules/integration_aws-elasticache-common/conf/05-free-memory.yaml create mode 100644 modules/integration_aws-elasticache-common/conf/06-evictions-growing.yaml rename modules/integration_aws-elasticache-common/{detectors-elasticache.tf => detectors-gen.tf} (65%) rename modules/integration_aws-elasticache-common/{variables.tf => variables-gen.tf} (73%) mode change 120000 => 100644 modules/integration_aws-elasticsearch/common-filters.tf mode change 120000 => 100644 modules/integration_aws-elasticsearch/common-locals.tf mode change 120000 => 100644 modules/integration_aws-elasticsearch/common-modules.tf mode change 120000 => 100644 modules/integration_aws-elasticsearch/common-variables.tf mode change 120000 => 100644 modules/integration_aws-elasticsearch/common-versions.tf create mode 100644 modules/integration_aws-elasticsearch/conf/00-heartbeat.yaml create mode 100644 modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml create mode 100644 modules/integration_aws-elasticsearch/conf/06-free-space.yaml create mode 100644 modules/integration_aws-elasticsearch/conf/07-ultrawarm-free-space.yaml create mode 100644 modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml create mode 100644 modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml delete mode 100644 
modules/integration_aws-elasticsearch/detectors-elasticsearch.tf create mode 100644 modules/integration_aws-elasticsearch/moved.tf delete mode 100644 modules/integration_aws-elasticsearch/variables.tf diff --git a/modules/integration_aws-elasticache-common/README.md b/modules/integration_aws-elasticache-common/README.md index cc025514c..6452286de 100644 --- a/modules/integration_aws-elasticache-common/README.md +++ b/modules/integration_aws-elasticache-common/README.md @@ -57,7 +57,7 @@ Note the following parameters: These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all [modules](../) in this repository. Other variables, specific to this module, are available in -[variables.tf](variables.tf). +[variables-gen.tf](variables-gen.tf). In general, the default configuration "works" but all of these Terraform [variables](https://www.terraform.io/language/values/variables) make it possible to customize the detectors behavior to better fit your needs. @@ -77,11 +77,11 @@ This module creates the following SignalFx detectors which could contain one or |---|---|---|---|---|---| |AWS ElastiCache heartbeat|X|-|-|-|-| |AWS ElastiCache evictions|X|X|-|-|-| -|AWS ElastiCache connections over max allowed|X|-|-|-|-| -|AWS ElastiCache current connections|X|-|-|-|-| +|AWS ElastiCache max connection|X|-|-|-|-| +|AWS ElastiCache no connection|X|-|-|-|-| |AWS ElastiCache swap|X|X|-|-|-| -|AWS ElastiCache freeable memory|-|X|X|-|-| -|AWS ElastiCache evictions changing rate grows|X|X|-|-|-| +|AWS ElastiCache free memory|-|X|X|-|-| +|AWS ElastiCache evictions growing|X|X|-|-|-| ## How to collect required metrics? 
diff --git a/modules/integration_aws-elasticache-common/common-filters.tf b/modules/integration_aws-elasticache-common/common-filters.tf deleted file mode 120000 index 9272cf517..000000000 --- a/modules/integration_aws-elasticache-common/common-filters.tf +++ /dev/null @@ -1 +0,0 @@ -../../common/module/filters-integration-aws.tf \ No newline at end of file diff --git a/modules/integration_aws-elasticache-common/common-filters.tf b/modules/integration_aws-elasticache-common/common-filters.tf new file mode 100644 index 000000000..cf7b9826c --- /dev/null +++ b/modules/integration_aws-elasticache-common/common-filters.tf @@ -0,0 +1,4 @@ +locals { + filters = "filter('aws_tag_env', '${var.environment}') and filter('aws_tag_sfx_monitored', 'true')" +} + diff --git a/modules/integration_aws-elasticache-common/common-locals.tf b/modules/integration_aws-elasticache-common/common-locals.tf deleted file mode 120000 index 5672d21ab..000000000 --- a/modules/integration_aws-elasticache-common/common-locals.tf +++ /dev/null @@ -1 +0,0 @@ -../../common/module/locals.tf \ No newline at end of file diff --git a/modules/integration_aws-elasticache-common/common-locals.tf b/modules/integration_aws-elasticache-common/common-locals.tf new file mode 100644 index 000000000..51a7650c1 --- /dev/null +++ b/modules/integration_aws-elasticache-common/common-locals.tf @@ -0,0 +1,44 @@ +locals { + heartbeat_auto_resolve_after = "1s" + not_running_vm_filters_gcp = "(not filter('gcp_status', '{Code=3, Name=STOPPING}', '{Code=4, Name=TERMINATED}'))" + not_running_vm_filters_aws = "(not filter('aws_state', '{Code: 32,Name: shutting-down}', '{Code: 48,Name: terminated}', '{Code: 64,Name: stopping}', '{Code: 80,Name: stopped}'))" + not_running_vm_filters_azure = "(not filter('azure_power_state', 'PowerState/stopping', 'PowerState/stopped', 'PowerState/deallocating', 'PowerState/deallocated'))" + not_running_vm_filters = format( + "%s and %s and %s", + local.not_running_vm_filters_aws, + 
local.not_running_vm_filters_gcp, + local.not_running_vm_filters_azure + ) + detector_name_prefix = "${join("", formatlist("[%s]", var.prefixes))}[${var.environment}]" + common_tags = concat(["terraform", var.environment], var.teams) + rule_subject_prefix = "[{{ruleSeverity}}]{{{detectorName}}} {{{readableRule}}}" + rule_subject_suffix = "on {{{dimensions}}}" + rule_subject = format("%s ({{inputs.signal.value}}) %s", local.rule_subject_prefix, local.rule_subject_suffix) + rule_subject_novalue = format("%s %s", local.rule_subject_prefix, local.rule_subject_suffix) + rule_body = <<-EOF + **Alert**: + *[{{ruleSeverity}}]{{{detectorName}}} {{{readableRule}}} ({{inputs.signal.value}})* + {{#if anomalous}} + **Triggered at**: + *{{timestamp}}* + {{else}} + **Cleared at**: + *{{timestamp}}* + {{/if}} + + {{#notEmpty dimensions}} + **Dimensions**: + *{{{dimensions}}}* + {{/notEmpty}} + + {{#if anomalous}} + {{#if runbookUrl}}**Runbook**: + Go to [this page]({{{runbookUrl}}}) for help and analysis. 
+ {{/if}} + + {{#if tip}}**Tip**: + {{{tip}}} + {{/if}} + {{/if}} +EOF +} diff --git a/modules/integration_aws-elasticache-common/common-modules.tf b/modules/integration_aws-elasticache-common/common-modules.tf deleted file mode 120000 index 8c81ef377..000000000 --- a/modules/integration_aws-elasticache-common/common-modules.tf +++ /dev/null @@ -1 +0,0 @@ -../../common/module/modules.tf \ No newline at end of file diff --git a/modules/integration_aws-elasticache-common/common-modules.tf b/modules/integration_aws-elasticache-common/common-modules.tf new file mode 100644 index 000000000..79d068bdd --- /dev/null +++ b/modules/integration_aws-elasticache-common/common-modules.tf @@ -0,0 +1,8 @@ +module "filtering" { + source = "../internal_filtering" + + filtering_default = local.filters + filtering_custom = var.filtering_custom + append_mode = var.filtering_append +} + diff --git a/modules/integration_aws-elasticache-common/common-variables.tf b/modules/integration_aws-elasticache-common/common-variables.tf deleted file mode 120000 index f3037a584..000000000 --- a/modules/integration_aws-elasticache-common/common-variables.tf +++ /dev/null @@ -1 +0,0 @@ -../../common/module/variables.tf \ No newline at end of file diff --git a/modules/integration_aws-elasticache-common/common-variables.tf b/modules/integration_aws-elasticache-common/common-variables.tf new file mode 100644 index 000000000..80cc77eee --- /dev/null +++ b/modules/integration_aws-elasticache-common/common-variables.tf @@ -0,0 +1,78 @@ +# Global + +variable "environment" { + description = "Infrastructure environment" + type = string +} + +variable "notifications" { + description = "Default notification recipients list per severity" + type = object({ + critical = list(string) + major = list(string) + minor = list(string) + warning = list(string) + info = list(string) + }) +} + +variable "prefixes" { + description = "Prefixes list to prepend between brackets on every monitors names before environment" + type 
= list(string) + default = [] +} + +variable "filtering_custom" { + description = "Filters as SignalFlow string to either replace or append to default filtering convention which is the only one used if not defined" + type = string + default = null +} + +variable "filtering_append" { + description = "If true, the `filtering_custom` string will be appended to the default filtering convention instead of fully replace it" + type = bool + default = false +} + +variable "detectors_disabled" { + description = "Disable all detectors in this module" + type = bool + default = false +} + +variable "runbook_url" { + description = "Default runbook URL to apply to all detectors (if not overridden at detector level)" + type = string + default = "" +} + +variable "authorized_writer_teams" { + description = "List of teams IDs authorized (with admins) to edit the detector. If defined, it requires an user token to work" + type = list(string) + default = null +} + +variable "teams" { + description = "List of teams IDs to associate the detector to" + type = list(string) + default = [] +} + +variable "message_subject" { + description = "The subject to use in alerting rules messages which overrides the default template" + type = string + default = "" +} + +variable "message_body" { + description = "The body to use in alerting rules messages which overrides the default template" + type = string + default = "" +} + +variable "extra_tags" { + description = "List of tags to add to the detectors resources, useful to find detectors " + type = list(string) + default = [] +} + diff --git a/modules/integration_aws-elasticache-common/common-versions.tf b/modules/integration_aws-elasticache-common/common-versions.tf deleted file mode 120000 index fa7f5509f..000000000 --- a/modules/integration_aws-elasticache-common/common-versions.tf +++ /dev/null @@ -1 +0,0 @@ -../../common/module/versions.tf \ No newline at end of file diff --git a/modules/integration_aws-elasticache-common/common-versions.tf 
b/modules/integration_aws-elasticache-common/common-versions.tf new file mode 100644 index 000000000..d77818c04 --- /dev/null +++ b/modules/integration_aws-elasticache-common/common-versions.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + signalfx = { + source = "splunk-terraform/signalfx" + version = ">= 7.0.0" + } + } + required_version = ">= 0.12.26" +} diff --git a/modules/integration_aws-elasticache-common/conf/00-heartbeat.yaml b/modules/integration_aws-elasticache-common/conf/00-heartbeat.yaml new file mode 100644 index 000000000..a03b1c779 --- /dev/null +++ b/modules/integration_aws-elasticache-common/conf/00-heartbeat.yaml @@ -0,0 +1,12 @@ +module: AWS ElastiCache +name: heartbeat + +transformation: false +aggregation: ".mean(by=['CacheClusterId'])" +filtering: "filter('stat', 'mean') and filter('namespace', 'AWS/ElastiCache')" + +signals: + signal: + metric: CPUUtilization +rules: + critical: diff --git a/modules/integration_aws-elasticache-common/conf/01-evictions.yaml b/modules/integration_aws-elasticache-common/conf/01-evictions.yaml new file mode 100644 index 000000000..5f0c0614a --- /dev/null +++ b/modules/integration_aws-elasticache-common/conf/01-evictions.yaml @@ -0,0 +1,21 @@ +module: AWS ElastiCache +name: "Evictions" + +transformation: ".sum(over='15m')" +aggregation: true + +filtering: "filter('namespace', 'AWS/ElastiCache')" + +signals: + signal: + metric: "Evictions" + filter: "filter('stat', 'mean') and filter('CacheNodeId', '*')" + +rules: + major: + threshold: 0 + comparator: ">" + dependency: critical + critical: + threshold: 30 + comparator: ">" diff --git a/modules/integration_aws-elasticache-common/conf/02-max-connection.yaml b/modules/integration_aws-elasticache-common/conf/02-max-connection.yaml new file mode 100644 index 000000000..fab8cdbf8 --- /dev/null +++ b/modules/integration_aws-elasticache-common/conf/02-max-connection.yaml @@ -0,0 +1,17 @@ +module: AWS ElastiCache +name: "Max connection" + +transformation: 
".max(over='5m')" +aggregation: true + +filtering: "filter('namespace', 'AWS/ElastiCache')" + +signals: + signal: + metric: "CurrConnections" + filter: "filter('stat', 'upper') and filter('CacheNodeId', '*')" + +rules: + critical: + threshold: 64999 + comparator: ">" diff --git a/modules/integration_aws-elasticache-common/conf/03-current-connection.yaml b/modules/integration_aws-elasticache-common/conf/03-current-connection.yaml new file mode 100644 index 000000000..12c4ca778 --- /dev/null +++ b/modules/integration_aws-elasticache-common/conf/03-current-connection.yaml @@ -0,0 +1,17 @@ +module: AWS ElastiCache +name: "No connection" + +transformation: ".min(over='5m')" +aggregation: true + +filtering: "filter('namespace', 'AWS/ElastiCache')" + +signals: + signal: + metric: "CurrConnections" + filter: "filter('stat', 'lower') and filter('CacheNodeId', '*')" + +rules: + critical: + threshold: 0 + comparator: "<=" diff --git a/modules/integration_aws-elasticache-common/conf/04-swap.yaml b/modules/integration_aws-elasticache-common/conf/04-swap.yaml new file mode 100644 index 000000000..ee4615b0a --- /dev/null +++ b/modules/integration_aws-elasticache-common/conf/04-swap.yaml @@ -0,0 +1,21 @@ +module: AWS ElastiCache +name: "Swap" + +transformation: ".min(over='5m')" +aggregation: true + +filtering: "filter('namespace', 'AWS/ElastiCache')" + +signals: + signal: + metric: "SwapUsage" + filter: "filter('stat', 'upper') and filter('CacheNodeId', '*')" + +rules: + major: + threshold: 0 + comparator: ">" + dependency: critical + critical: + threshold: 50000000 + comparator: ">" diff --git a/modules/integration_aws-elasticache-common/conf/05-free-memory.yaml b/modules/integration_aws-elasticache-common/conf/05-free-memory.yaml new file mode 100644 index 000000000..12367fffe --- /dev/null +++ b/modules/integration_aws-elasticache-common/conf/05-free-memory.yaml @@ -0,0 +1,21 @@ +module: AWS ElastiCache +name: "Free memory" + +transformation: ".rateofchange().mean(over='15m')" 
+aggregation: true + +filtering: "filter('namespace', 'AWS/ElastiCache')" + +signals: + signal: + metric: "FreeableMemory" + filter: "filter('stat', 'lower') and filter('CacheNodeId', '*')" + +rules: + minor: + threshold: -50 + comparator: "<" + dependency: major + major: + threshold: -70 + comparator: "<" diff --git a/modules/integration_aws-elasticache-common/conf/06-evictions-growing.yaml b/modules/integration_aws-elasticache-common/conf/06-evictions-growing.yaml new file mode 100644 index 000000000..ad4d9c42b --- /dev/null +++ b/modules/integration_aws-elasticache-common/conf/06-evictions-growing.yaml @@ -0,0 +1,21 @@ +module: AWS ElastiCache +name: "Evictions growing" + +transformation: ".mean(over='5m').rateofchange().scale(100)" +aggregation: true + +filtering: "filter('namespace', 'AWS/ElastiCache')" + +signals: + signal: + metric: "Evictions" + filter: "filter('stat', 'mean') and filter('CacheNodeId', '*')" + +rules: + major: + threshold: 10 + comparator: ">" + dependency: critical + critical: + threshold: 30 + comparator: ">" diff --git a/modules/integration_aws-elasticache-common/detectors-elasticache.tf b/modules/integration_aws-elasticache-common/detectors-gen.tf similarity index 65% rename from modules/integration_aws-elasticache-common/detectors-elasticache.tf rename to modules/integration_aws-elasticache-common/detectors-gen.tf index 6ce99984d..84cd14d6d 100644 --- a/modules/integration_aws-elasticache-common/detectors-elasticache.tf +++ b/modules/integration_aws-elasticache-common/detectors-gen.tf @@ -7,7 +7,8 @@ resource "signalfx_detector" "heartbeat" { program_text = <<-EOF from signalfx.detectors.not_reporting import not_reporting - signal = data('CPUUtilization', filter=filter('stat', 'mean') and filter('namespace', 'AWS/ElastiCache') and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + base_filtering = filter('stat', 'mean') and filter('namespace', 'AWS/ElastiCache') + signal = data('CPUUtilization', 
filter=base_filtering and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') EOF @@ -34,17 +35,18 @@ resource "signalfx_detector" "evictions" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('Evictions', filter=filter('namespace', 'AWS/ElastiCache') and filter('stat', 'mean') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.evictions_aggregation_function}${var.evictions_transformation_function}.publish('signal') - detect(when(signal > ${var.evictions_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.evictions_threshold_major}) and (not when(signal > ${var.evictions_threshold_critical}))).publish('MAJOR') + base_filtering = filter('namespace', 'AWS/ElastiCache') + signal = data('Evictions', filter=base_filtering and filter('stat', 'mean') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.evictions_aggregation_function}${var.evictions_transformation_function}.publish('signal') + detect(when(signal > ${var.evictions_threshold_major}, lasting=%{if var.evictions_lasting_duration_major == null}None%{else}'${var.evictions_lasting_duration_major}'%{endif}, at_least=${var.evictions_at_least_percentage_major}) and (not when(signal > ${var.evictions_threshold_critical}, lasting=%{if var.evictions_lasting_duration_critical == null}None%{else}'${var.evictions_lasting_duration_critical}'%{endif}, at_least=${var.evictions_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal > ${var.evictions_threshold_critical}, lasting=%{if var.evictions_lasting_duration_critical == null}None%{else}'${var.evictions_lasting_duration_critical}'%{endif}, at_least=${var.evictions_at_least_percentage_critical})).publish('CRIT') EOF rule { - 
description = "is too high > ${var.evictions_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.evictions_disabled_critical, var.evictions_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.evictions_notifications, "critical", []), var.notifications.critical), null) + description = "is too high > ${var.evictions_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.evictions_disabled_major, var.evictions_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.evictions_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.evictions_runbook_url, var.runbook_url), "") tip = var.evictions_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -52,11 +54,11 @@ EOF } rule { - description = "is too high > ${var.evictions_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.evictions_disabled_major, var.evictions_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.evictions_notifications, "major", []), var.notifications.major), null) + description = "is too high > ${var.evictions_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.evictions_disabled_critical, var.evictions_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.evictions_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.evictions_runbook_url, var.runbook_url), "") tip = var.evictions_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject @@ -67,22 +69,23 @@ EOF } resource "signalfx_detector" "max_connection" { - name = format("%s %s", local.detector_name_prefix, "AWS ElastiCache connections over max allowed") + name = format("%s %s", local.detector_name_prefix, "AWS ElastiCache max connection") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('CurrConnections', filter=filter('namespace', 'AWS/ElastiCache') and filter('stat', 'upper') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.max_connection_aggregation_function}${var.max_connection_transformation_function}.publish('signal') - detect(when(signal > ${var.max_connection_threshold_critical})).publish('CRIT') + base_filtering = filter('namespace', 'AWS/ElastiCache') + signal = data('CurrConnections', filter=base_filtering and filter('stat', 'upper') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.max_connection_aggregation_function}${var.max_connection_transformation_function}.publish('signal') + detect(when(signal > ${var.max_connection_threshold_critical}, lasting=%{if var.max_connection_lasting_duration_critical == null}None%{else}'${var.max_connection_lasting_duration_critical}'%{endif}, at_least=${var.max_connection_at_least_percentage_critical})).publish('CRIT') EOF rule { description = "is too high > ${var.max_connection_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.max_connection_disabled_critical, var.max_connection_disabled, var.detectors_disabled) + disabled = coalesce(var.max_connection_disabled, var.detectors_disabled) notifications = try(coalescelist(lookup(var.max_connection_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.max_connection_runbook_url, var.runbook_url), "") 
tip = var.max_connection_tip @@ -94,22 +97,23 @@ EOF } resource "signalfx_detector" "no_connection" { - name = format("%s %s", local.detector_name_prefix, "AWS ElastiCache current connections") + name = format("%s %s", local.detector_name_prefix, "AWS ElastiCache no connection") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('CurrConnections', filter=filter('namespace', 'AWS/ElastiCache') and filter('stat', 'lower') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.no_connection_aggregation_function}${var.no_connection_transformation_function}.publish('signal') - detect(when(signal <= ${var.no_connection_threshold_critical})).publish('CRIT') + base_filtering = filter('namespace', 'AWS/ElastiCache') + signal = data('CurrConnections', filter=base_filtering and filter('stat', 'lower') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.no_connection_aggregation_function}${var.no_connection_transformation_function}.publish('signal') + detect(when(signal <= ${var.no_connection_threshold_critical}, lasting=%{if var.no_connection_lasting_duration_critical == null}None%{else}'${var.no_connection_lasting_duration_critical}'%{endif}, at_least=${var.no_connection_at_least_percentage_critical})).publish('CRIT') EOF rule { - description = "are too low <= ${var.no_connection_threshold_critical}" + description = "is too low <= ${var.no_connection_threshold_critical}" severity = "Critical" detect_label = "CRIT" - disabled = coalesce(var.no_connection_disabled_critical, var.no_connection_disabled, var.detectors_disabled) + disabled = coalesce(var.no_connection_disabled, var.detectors_disabled) notifications = try(coalescelist(lookup(var.no_connection_notifications, "critical", []), var.notifications.critical), null) runbook_url = 
try(coalesce(var.no_connection_runbook_url, var.runbook_url), "") tip = var.no_connection_tip @@ -128,17 +132,18 @@ resource "signalfx_detector" "swap" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('SwapUsage', filter=filter('namespace', 'AWS/ElastiCache') and filter('stat', 'upper') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.swap_aggregation_function}${var.swap_transformation_function}.publish('signal') - detect(when(signal > ${var.swap_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.swap_threshold_major}) and (not when(signal > ${var.swap_threshold_critical}))).publish('MAJOR') + base_filtering = filter('namespace', 'AWS/ElastiCache') + signal = data('SwapUsage', filter=base_filtering and filter('stat', 'upper') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.swap_aggregation_function}${var.swap_transformation_function}.publish('signal') + detect(when(signal > ${var.swap_threshold_major}, lasting=%{if var.swap_lasting_duration_major == null}None%{else}'${var.swap_lasting_duration_major}'%{endif}, at_least=${var.swap_at_least_percentage_major}) and (not when(signal > ${var.swap_threshold_critical}, lasting=%{if var.swap_lasting_duration_critical == null}None%{else}'${var.swap_lasting_duration_critical}'%{endif}, at_least=${var.swap_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal > ${var.swap_threshold_critical}, lasting=%{if var.swap_lasting_duration_critical == null}None%{else}'${var.swap_lasting_duration_critical}'%{endif}, at_least=${var.swap_at_least_percentage_critical})).publish('CRIT') EOF rule { - description = "is too high > ${var.swap_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.swap_disabled_critical, var.swap_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.swap_notifications, "critical", []), 
var.notifications.critical), null) + description = "is too high > ${var.swap_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.swap_disabled_major, var.swap_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.swap_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.swap_runbook_url, var.runbook_url), "") tip = var.swap_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -146,11 +151,11 @@ EOF } rule { - description = "is too high > ${var.swap_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.swap_disabled_major, var.swap_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.swap_notifications, "major", []), var.notifications.major), null) + description = "is too high > ${var.swap_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.swap_disabled_critical, var.swap_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.swap_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.swap_runbook_url, var.runbook_url), "") tip = var.swap_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject @@ -161,24 +166,25 @@ EOF } resource "signalfx_detector" "free_memory" { - name = format("%s %s", local.detector_name_prefix, "AWS ElastiCache freeable memory") + name = format("%s %s", local.detector_name_prefix, "AWS ElastiCache free memory") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('FreeableMemory', filter=filter('namespace', 'AWS/ElastiCache') and filter('stat', 'lower') and filter('CacheNodeId', '*') and ${module.filtering.signalflow}).rateofchange()${var.free_memory_aggregation_function}${var.free_memory_transformation_function}.publish('signal') - detect(when(signal < ${var.free_memory_threshold_major})).publish('MAJOR') - detect(when(signal < ${var.free_memory_threshold_minor}) and (not when(signal < ${var.free_memory_threshold_major}))).publish('MINOR') + base_filtering = filter('namespace', 'AWS/ElastiCache') + signal = data('FreeableMemory', filter=base_filtering and filter('stat', 'lower') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.free_memory_aggregation_function}${var.free_memory_transformation_function}.publish('signal') + detect(when(signal < ${var.free_memory_threshold_minor}, lasting=%{if var.free_memory_lasting_duration_minor == null}None%{else}'${var.free_memory_lasting_duration_minor}'%{endif}, at_least=${var.free_memory_at_least_percentage_minor}) and (not when(signal < ${var.free_memory_threshold_major}, lasting=%{if var.free_memory_lasting_duration_major == null}None%{else}'${var.free_memory_lasting_duration_major}'%{endif}, at_least=${var.free_memory_at_least_percentage_major}))).publish('MINOR') + detect(when(signal < ${var.free_memory_threshold_major}, lasting=%{if var.free_memory_lasting_duration_major == null}None%{else}'${var.free_memory_lasting_duration_major}'%{endif}, 
at_least=${var.free_memory_at_least_percentage_major})).publish('MAJOR') EOF rule { - description = "is too low < ${var.free_memory_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.free_memory_disabled_major, var.free_memory_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.free_memory_notifications, "major", []), var.notifications.major), null) + description = "is too low < ${var.free_memory_threshold_minor}" + severity = "Minor" + detect_label = "MINOR" + disabled = coalesce(var.free_memory_disabled_minor, var.free_memory_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.free_memory_notifications, "minor", []), var.notifications.minor), null) runbook_url = try(coalesce(var.free_memory_runbook_url, var.runbook_url), "") tip = var.free_memory_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -186,11 +192,11 @@ EOF } rule { - description = "is too low < ${var.free_memory_threshold_minor}" - severity = "Minor" - detect_label = "MINOR" - disabled = coalesce(var.free_memory_disabled_minor, var.free_memory_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.free_memory_notifications, "minor", []), var.notifications.minor), null) + description = "is too low < ${var.free_memory_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.free_memory_disabled_major, var.free_memory_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.free_memory_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.free_memory_runbook_url, var.runbook_url), "") tip = var.free_memory_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject @@ -201,24 +207,25 @@ EOF } resource "signalfx_detector" "evictions_growing" { - name = format("%s %s", local.detector_name_prefix, "AWS ElastiCache evictions changing rate grows") + name = format("%s %s", local.detector_name_prefix, "AWS ElastiCache evictions growing") authorized_writer_teams = var.authorized_writer_teams teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) program_text = <<-EOF - signal = data('Evictions', filter=filter('namespace', 'AWS/ElastiCache') and filter('stat', 'mean') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.evictions_growing_aggregation_function}${var.evictions_growing_transformation_function}.rateofchange().scale(100).publish('signal') - detect(when(signal > ${var.evictions_growing_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.evictions_growing_threshold_major}) and (not when(signal > ${var.evictions_growing_threshold_critical}))).publish('MAJOR') + base_filtering = filter('namespace', 'AWS/ElastiCache') + signal = data('Evictions', filter=base_filtering and filter('stat', 'mean') and filter('CacheNodeId', '*') and ${module.filtering.signalflow})${var.evictions_growing_aggregation_function}${var.evictions_growing_transformation_function}.publish('signal') + detect(when(signal > ${var.evictions_growing_threshold_major}, lasting=%{if var.evictions_growing_lasting_duration_major == null}None%{else}'${var.evictions_growing_lasting_duration_major}'%{endif}, at_least=${var.evictions_growing_at_least_percentage_major}) and (not when(signal > ${var.evictions_growing_threshold_critical}, lasting=%{if var.evictions_growing_lasting_duration_critical == null}None%{else}'${var.evictions_growing_lasting_duration_critical}'%{endif}, at_least=${var.evictions_growing_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal > 
${var.evictions_growing_threshold_critical}, lasting=%{if var.evictions_growing_lasting_duration_critical == null}None%{else}'${var.evictions_growing_lasting_duration_critical}'%{endif}, at_least=${var.evictions_growing_at_least_percentage_critical})).publish('CRIT') EOF rule { - description = "too fast > ${var.evictions_growing_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.evictions_growing_disabled_critical, var.evictions_growing_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.evictions_growing_notifications, "critical", []), var.notifications.critical), null) + description = "is too high > ${var.evictions_growing_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.evictions_growing_disabled_major, var.evictions_growing_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.evictions_growing_notifications, "major", []), var.notifications.major), null) runbook_url = try(coalesce(var.evictions_growing_runbook_url, var.runbook_url), "") tip = var.evictions_growing_tip parameterized_subject = var.message_subject == "" ? 
local.rule_subject : var.message_subject @@ -226,11 +233,11 @@ EOF } rule { - description = "too fast > ${var.evictions_growing_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.evictions_growing_disabled_major, var.evictions_growing_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.evictions_growing_notifications, "major", []), var.notifications.major), null) + description = "is too high > ${var.evictions_growing_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.evictions_growing_disabled_critical, var.evictions_growing_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.evictions_growing_notifications, "critical", []), var.notifications.critical), null) runbook_url = try(coalesce(var.evictions_growing_runbook_url, var.runbook_url), "") tip = var.evictions_growing_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject diff --git a/modules/integration_aws-elasticache-common/variables.tf b/modules/integration_aws-elasticache-common/variables-gen.tf similarity index 73% rename from modules/integration_aws-elasticache-common/variables.tf rename to modules/integration_aws-elasticache-common/variables-gen.tf index f868a3803..39a0be532 100644 --- a/modules/integration_aws-elasticache-common/variables.tf +++ b/modules/integration_aws-elasticache-common/variables-gen.tf @@ -1,6 +1,16 @@ -# Module specific +# heartbeat detector -# Heartbeat detector +variable "heartbeat_notifications" { + description = "Notification recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. 
\".mean(by=['host'])\")" + type = string + default = ".mean(by=['CacheClusterId'])" +} variable "heartbeat_max_delay" { description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" @@ -26,25 +36,31 @@ variable "heartbeat_disabled" { default = null } -variable "heartbeat_notifications" { - description = "Notification recipients list per severity overridden for heartbeat detector" - type = map(list(string)) - default = {} -} - variable "heartbeat_timeframe" { description = "Timeframe for heartbeat detector (i.e. \"10m\")" type = string default = "10m" } -variable "heartbeat_aggregation_function" { - description = "Aggregation function and group by for heartbeat detector (i.e. \".mean(by=['host'])\")" +# evictions detector + +variable "evictions_notifications" { + description = "Notification recipients list per severity overridden for evictions detector" + type = map(list(string)) + default = {} +} + +variable "evictions_aggregation_function" { + description = "Aggregation function and group by for evictions detector (i.e. \".mean(by=['host'])\")" type = string - default = ".mean(by=['CacheClusterId'])" + default = "" } -# Evictions detector +variable "evictions_transformation_function" { + description = "Transformation function for evictions detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".sum(over='15m')" +} variable "evictions_max_delay" { description = "Enforce max delay for evictions detector (use \"0\" or \"null\" for \"Auto\")" @@ -70,49 +86,71 @@ variable "evictions_disabled" { default = null } -variable "evictions_disabled_critical" { - description = "Disable critical alerting rule for evictions detector" +variable "evictions_disabled_major" { + description = "Disable major alerting rule for evictions detector" type = bool default = null } -variable "evictions_disabled_major" { - description = "Disable major alerting rule for evictions detector" +variable "evictions_disabled_critical" { + description = "Disable critical alerting rule for evictions detector" type = bool default = null } -variable "evictions_notifications" { - description = "Notification recipients list per severity overridden for evictions detector" - type = map(list(string)) - default = {} +variable "evictions_threshold_major" { + description = "Major threshold for evictions detector" + type = number + default = 0 } -variable "evictions_aggregation_function" { - description = "Aggregation function and group by for evictions detector (i.e. \".mean(by=['host'])\")" +variable "evictions_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = null } -variable "evictions_transformation_function" { - description = "Transformation function for evictions detector (i.e. 
\".mean(over='5m')\")" - type = string - default = ".sum(over='15m')" +variable "evictions_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } - variable "evictions_threshold_critical" { description = "Critical threshold for evictions detector" type = number default = 30 } -variable "evictions_threshold_major" { - description = "Major threshold for evictions detector" +variable "evictions_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "evictions_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 0 + default = 1 } +# max_connection detector -# Max_connection detector +variable "max_connection_notifications" { + description = "Notification recipients list per severity overridden for max_connection detector" + type = map(list(string)) + default = {} +} + +variable "max_connection_aggregation_function" { + description = "Aggregation function and group by for max_connection detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "max_connection_transformation_function" { + description = "Transformation function for max_connection detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".max(over='5m')" +} variable "max_connection_max_delay" { description = "Enforce max delay for max_connection detector (use \"0\" or \"null\" for \"Auto\")" @@ -138,38 +176,43 @@ variable "max_connection_disabled" { default = null } -variable "max_connection_disabled_critical" { - description = "Disable critical alerting rule for max_connection detector" - type = bool +variable "max_connection_threshold_critical" { + description = "Critical threshold for max_connection detector" + type = number + default = 64999 +} + +variable "max_connection_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string default = null } -variable "max_connection_notifications" { - description = "Notification recipients list per severity overridden for max_connection detector" +variable "max_connection_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# no_connection detector + +variable "no_connection_notifications" { + description = "Notification recipients list per severity overridden for no_connection detector" type = map(list(string)) default = {} } -variable "max_connection_aggregation_function" { - description = "Aggregation function and group by for max_connection detector (i.e. \".mean(by=['host'])\")" +variable "no_connection_aggregation_function" { + description = "Aggregation function and group by for no_connection detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "max_connection_transformation_function" { - description = "Transformation function for max_connection detector (i.e. \".mean(over='5m')\")" +variable "no_connection_transformation_function" { + description = "Transformation function for no_connection detector (i.e. 
\".mean(over='5m')\")" type = string - default = ".max(over='5m')" -} - -variable "max_connection_threshold_critical" { - description = "Critical threshold for max_connection detector" - type = number - default = 64999 + default = ".min(over='5m')" } -# No_connection detector - variable "no_connection_max_delay" { description = "Enforce max delay for no_connection detector (use \"0\" or \"null\" for \"Auto\")" type = number @@ -194,38 +237,43 @@ variable "no_connection_disabled" { default = null } -variable "no_connection_disabled_critical" { - description = "Disable critical alerting rule for no_connection detector" - type = bool +variable "no_connection_threshold_critical" { + description = "Critical threshold for no_connection detector" + type = number + default = 0 +} + +variable "no_connection_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string default = null } -variable "no_connection_notifications" { - description = "Notification recipients list per severity overridden for no_connection detector" +variable "no_connection_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# swap detector + +variable "swap_notifications" { + description = "Notification recipients list per severity overridden for swap detector" type = map(list(string)) default = {} } -variable "no_connection_aggregation_function" { - description = "Aggregation function and group by for no_connection detector (i.e. \".mean(by=['host'])\")" +variable "swap_aggregation_function" { + description = "Aggregation function and group by for swap detector (i.e. \".mean(by=['host'])\")" type = string default = "" } -variable "no_connection_transformation_function" { - description = "Transformation function for no_connection detector (i.e. 
\".mean(over='5m')\")" +variable "swap_transformation_function" { + description = "Transformation function for swap detector (i.e. \".mean(over='5m')\")" type = string default = ".min(over='5m')" } -variable "no_connection_threshold_critical" { - description = "Critical threshold for no_connection detector" - type = number - default = 0 -} - -# Swap detector - variable "swap_max_delay" { description = "Enforce max delay for swap detector (use \"0\" or \"null\" for \"Auto\")" type = number @@ -250,49 +298,71 @@ variable "swap_disabled" { default = null } -variable "swap_disabled_critical" { - description = "Disable critical alerting rule for swap detector" +variable "swap_disabled_major" { + description = "Disable major alerting rule for swap detector" type = bool default = null } -variable "swap_disabled_major" { - description = "Disable major alerting rule for swap detector" +variable "swap_disabled_critical" { + description = "Disable critical alerting rule for swap detector" type = bool default = null } -variable "swap_notifications" { - description = "Notification recipients list per severity overridden for swap detector" - type = map(list(string)) - default = {} +variable "swap_threshold_major" { + description = "Major threshold for swap detector" + type = number + default = 0 } -variable "swap_aggregation_function" { - description = "Aggregation function and group by for swap detector (i.e. \".mean(by=['host'])\")" +variable "swap_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = null } -variable "swap_transformation_function" { - description = "Transformation function for swap detector (i.e. 
\".mean(over='5m')\")" - type = string - default = ".min(over='5m')" +variable "swap_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } - variable "swap_threshold_critical" { description = "Critical threshold for swap detector" type = number default = 50000000 } -variable "swap_threshold_major" { - description = "Major threshold for swap detector" +variable "swap_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "swap_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = 0 + default = 1 +} +# free_memory detector + +variable "free_memory_notifications" { + description = "Notification recipients list per severity overridden for free_memory detector" + type = map(list(string)) + default = {} } -# Free_memory detector +variable "free_memory_aggregation_function" { + description = "Aggregation function and group by for free_memory detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "free_memory_transformation_function" { + description = "Transformation function for free_memory detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".rateofchange().mean(over='15m')" +} variable "free_memory_max_delay" { description = "Enforce max delay for free_memory detector (use \"0\" or \"null\" for \"Auto\")" @@ -315,12 +385,6 @@ variable "free_memory_runbook_url" { variable "free_memory_disabled" { description = "Disable all alerting rules for free_memory detector" type = bool - default = true -} - -variable "free_memory_disabled_major" { - description = "Disable major alerting rule for free_memory detector" - type = bool default = null } @@ -330,37 +394,65 @@ variable "free_memory_disabled_minor" { default = null } -variable "free_memory_notifications" { - description = "Notification recipients list per severity overridden for free_memory detector" - type = map(list(string)) - default = {} +variable "free_memory_disabled_major" { + description = "Disable major alerting rule for free_memory detector" + type = bool + default = null } -variable "free_memory_aggregation_function" { - description = "Aggregation function and group by for free_memory detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" +variable "free_memory_threshold_minor" { + description = "Minor threshold for free_memory detector" + type = number + default = -50 } -variable "free_memory_transformation_function" { - description = "Transformation function for free_memory detector (i.e. 
\".mean(over='5m')\")" +variable "free_memory_lasting_duration_minor" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = ".mean(over='15m')" + default = null } +variable "free_memory_at_least_percentage_minor" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} variable "free_memory_threshold_major" { description = "Major threshold for free_memory detector" type = number default = -70 } -variable "free_memory_threshold_minor" { - description = "Minor threshold for free_memory detector" +variable "free_memory_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "free_memory_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number - default = -50 + default = 1 } +# evictions_growing detector -# Evictions_growing detector +variable "evictions_growing_notifications" { + description = "Notification recipients list per severity overridden for evictions_growing detector" + type = map(list(string)) + default = {} +} + +variable "evictions_growing_aggregation_function" { + description = "Aggregation function and group by for evictions_growing detector (i.e. \".mean(by=['host'])\")" + type = string + default = "" +} + +variable "evictions_growing_transformation_function" { + description = "Transformation function for evictions_growing detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".mean(over='5m').rateofchange().scale(100)" +} variable "evictions_growing_max_delay" { description = "Enforce max delay for evictions_growing detector (use \"0\" or \"null\" for \"Auto\")" @@ -386,45 +478,49 @@ variable "evictions_growing_disabled" { default = null } -variable "evictions_growing_disabled_critical" { - description = "Disable critical alerting rule for evictions_growing detector" +variable "evictions_growing_disabled_major" { + description = "Disable major alerting rule for evictions_growing detector" type = bool default = null } -variable "evictions_growing_disabled_major" { - description = "Disable major alerting rule for evictions_growing detector" +variable "evictions_growing_disabled_critical" { + description = "Disable critical alerting rule for evictions_growing detector" type = bool default = null } -variable "evictions_growing_notifications" { - description = "Notification recipients list per severity overridden for evictions_growing detector" - type = map(list(string)) - default = {} +variable "evictions_growing_threshold_major" { + description = "Major threshold for evictions_growing detector" + type = number + default = 10 } -variable "evictions_growing_aggregation_function" { - description = "Aggregation function and group by for evictions_growing detector (i.e. \".mean(by=['host'])\")" +variable "evictions_growing_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" type = string - default = "" + default = null } -variable "evictions_growing_transformation_function" { - description = "Transformation function for evictions_growing detector (i.e. 
\".mean(over='5m')\")" - type = string - default = ".mean(over='5m')" +variable "evictions_growing_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 } - variable "evictions_growing_threshold_critical" { description = "Critical threshold for evictions_growing detector" type = number default = 30 } -variable "evictions_growing_threshold_major" { - description = "Major threshold for evictions_growing detector" - type = number - default = 10 +variable "evictions_growing_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null } +variable "evictions_growing_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} diff --git a/modules/integration_aws-elasticsearch/README.md b/modules/integration_aws-elasticsearch/README.md index 5d3be1dc9..5f7dc30ca 100644 --- a/modules/integration_aws-elasticsearch/README.md +++ b/modules/integration_aws-elasticsearch/README.md @@ -59,7 +59,7 @@ Note the following parameters: These 3 parameters along with all variables defined in [common-variables.tf](common-variables.tf) are common to all [modules](../) in this repository. Other variables, specific to this module, are available in -[variables.tf](variables.tf) and [variables-gen.tf](variables-gen.tf). +[variables-gen.tf](variables-gen.tf). In general, the default configuration "works" but all of these Terraform [variables](https://www.terraform.io/language/values/variables) make it possible to customize the detectors behavior to better fit your needs. 
@@ -77,16 +77,16 @@ This module creates the following SignalFx detectors which could contain one or |Detector|Critical|Major|Minor|Warning|Info| |---|---|---|---|---|---| -|AWS ElasticSearch heartbeat|X|-|-|-|-| -|AWS ElasticSearch cluster status|X|X|-|-|-| -|AWS ElasticSearch cluster free storage space|X|X|-|-|-| -|AWS ElasticSearch cluster UltraWarm free storage space|X|X|-|-|-| -|AWS ElasticSearch cluster CPU|X|X|-|-|-| -|AWS ElasticSearch cluster Master nodes CPU|X|X|-|-|-| +|AWS Elasticsearch heartbeat|X|-|-|-|-| |AWS Elasticsearch jvm memory pressure|X|X|-|-|-| |AWS Elasticsearch 4xx http response|X|X|-|-|-| |AWS Elasticsearch 5xx http response|X|X|-|-|-| |AWS Elasticsearch shard count|X|X|-|-|-| +|AWS Elasticsearch cluster status|X|X|-|-|-| +|AWS Elasticsearch free space|X|X|-|-|-| +|AWS Elasticsearch ultrawarm free space|X|X|-|-|-| +|AWS Elasticsearch cluster cpu|X|X|-|-|-| +|AWS Elasticsearch master cpu|X|X|-|-|-| ## How to collect required metrics? @@ -110,6 +110,7 @@ Here is the list of required metrics for detectors in this module. 
* `ClusterStatus.red` * `ClusterStatus.yellow` * `CPUUtilization` +* `ElasticsearchRequests` * `FreeStorageSpace` * `JVMMemoryPressure` * `MasterCPUUtilization` diff --git a/modules/integration_aws-elasticsearch/common-filters.tf b/modules/integration_aws-elasticsearch/common-filters.tf deleted file mode 120000 index 9272cf517..000000000 --- a/modules/integration_aws-elasticsearch/common-filters.tf +++ /dev/null @@ -1 +0,0 @@ -../../common/module/filters-integration-aws.tf \ No newline at end of file diff --git a/modules/integration_aws-elasticsearch/common-filters.tf b/modules/integration_aws-elasticsearch/common-filters.tf new file mode 100644 index 000000000..cf7b9826c --- /dev/null +++ b/modules/integration_aws-elasticsearch/common-filters.tf @@ -0,0 +1,4 @@ +locals { + filters = "filter('aws_tag_env', '${var.environment}') and filter('aws_tag_sfx_monitored', 'true')" +} + diff --git a/modules/integration_aws-elasticsearch/common-locals.tf b/modules/integration_aws-elasticsearch/common-locals.tf deleted file mode 120000 index 5672d21ab..000000000 --- a/modules/integration_aws-elasticsearch/common-locals.tf +++ /dev/null @@ -1 +0,0 @@ -../../common/module/locals.tf \ No newline at end of file diff --git a/modules/integration_aws-elasticsearch/common-locals.tf b/modules/integration_aws-elasticsearch/common-locals.tf new file mode 100644 index 000000000..51a7650c1 --- /dev/null +++ b/modules/integration_aws-elasticsearch/common-locals.tf @@ -0,0 +1,44 @@ +locals { + heartbeat_auto_resolve_after = "1s" + not_running_vm_filters_gcp = "(not filter('gcp_status', '{Code=3, Name=STOPPING}', '{Code=4, Name=TERMINATED}'))" + not_running_vm_filters_aws = "(not filter('aws_state', '{Code: 32,Name: shutting-down}', '{Code: 48,Name: terminated}', '{Code: 64,Name: stopping}', '{Code: 80,Name: stopped}'))" + not_running_vm_filters_azure = "(not filter('azure_power_state', 'PowerState/stopping', 'PowerState/stopped', 'PowerState/deallocating', 'PowerState/deallocated'))" + 
not_running_vm_filters = format( + "%s and %s and %s", + local.not_running_vm_filters_aws, + local.not_running_vm_filters_gcp, + local.not_running_vm_filters_azure + ) + detector_name_prefix = "${join("", formatlist("[%s]", var.prefixes))}[${var.environment}]" + common_tags = concat(["terraform", var.environment], var.teams) + rule_subject_prefix = "[{{ruleSeverity}}]{{{detectorName}}} {{{readableRule}}}" + rule_subject_suffix = "on {{{dimensions}}}" + rule_subject = format("%s ({{inputs.signal.value}}) %s", local.rule_subject_prefix, local.rule_subject_suffix) + rule_subject_novalue = format("%s %s", local.rule_subject_prefix, local.rule_subject_suffix) + rule_body = <<-EOF + **Alert**: + *[{{ruleSeverity}}]{{{detectorName}}} {{{readableRule}}} ({{inputs.signal.value}})* + {{#if anomalous}} + **Triggered at**: + *{{timestamp}}* + {{else}} + **Cleared at**: + *{{timestamp}}* + {{/if}} + + {{#notEmpty dimensions}} + **Dimensions**: + *{{{dimensions}}}* + {{/notEmpty}} + + {{#if anomalous}} + {{#if runbookUrl}}**Runbook**: + Go to [this page]({{{runbookUrl}}}) for help and analysis. 
+ {{/if}} + + {{#if tip}}**Tip**: + {{{tip}}} + {{/if}} + {{/if}} +EOF +} diff --git a/modules/integration_aws-elasticsearch/common-modules.tf b/modules/integration_aws-elasticsearch/common-modules.tf deleted file mode 120000 index 8c81ef377..000000000 --- a/modules/integration_aws-elasticsearch/common-modules.tf +++ /dev/null @@ -1 +0,0 @@ -../../common/module/modules.tf \ No newline at end of file diff --git a/modules/integration_aws-elasticsearch/common-modules.tf b/modules/integration_aws-elasticsearch/common-modules.tf new file mode 100644 index 000000000..79d068bdd --- /dev/null +++ b/modules/integration_aws-elasticsearch/common-modules.tf @@ -0,0 +1,8 @@ +module "filtering" { + source = "../internal_filtering" + + filtering_default = local.filters + filtering_custom = var.filtering_custom + append_mode = var.filtering_append +} + diff --git a/modules/integration_aws-elasticsearch/common-variables.tf b/modules/integration_aws-elasticsearch/common-variables.tf deleted file mode 120000 index f3037a584..000000000 --- a/modules/integration_aws-elasticsearch/common-variables.tf +++ /dev/null @@ -1 +0,0 @@ -../../common/module/variables.tf \ No newline at end of file diff --git a/modules/integration_aws-elasticsearch/common-variables.tf b/modules/integration_aws-elasticsearch/common-variables.tf new file mode 100644 index 000000000..80cc77eee --- /dev/null +++ b/modules/integration_aws-elasticsearch/common-variables.tf @@ -0,0 +1,78 @@ +# Global + +variable "environment" { + description = "Infrastructure environment" + type = string +} + +variable "notifications" { + description = "Default notification recipients list per severity" + type = object({ + critical = list(string) + major = list(string) + minor = list(string) + warning = list(string) + info = list(string) + }) +} + +variable "prefixes" { + description = "Prefixes list to prepend between brackets on every monitors names before environment" + type = list(string) + default = [] +} + +variable 
"filtering_custom" { + description = "Filters as SignalFlow string to either replace or append to default filtering convention which is the only one used if not defined" + type = string + default = null +} + +variable "filtering_append" { + description = "If true, the `filtering_custom` string will be appended to the default filtering convention instead of fully replace it" + type = bool + default = false +} + +variable "detectors_disabled" { + description = "Disable all detectors in this module" + type = bool + default = false +} + +variable "runbook_url" { + description = "Default runbook URL to apply to all detectors (if not overridden at detector level)" + type = string + default = "" +} + +variable "authorized_writer_teams" { + description = "List of teams IDs authorized (with admins) to edit the detector. If defined, it requires an user token to work" + type = list(string) + default = null +} + +variable "teams" { + description = "List of teams IDs to associate the detector to" + type = list(string) + default = [] +} + +variable "message_subject" { + description = "The subject to use in alerting rules messages which overrides the default template" + type = string + default = "" +} + +variable "message_body" { + description = "The body to use in alerting rules messages which overrides the default template" + type = string + default = "" +} + +variable "extra_tags" { + description = "List of tags to add to the detectors resources, useful to find detectors " + type = list(string) + default = [] +} + diff --git a/modules/integration_aws-elasticsearch/common-versions.tf b/modules/integration_aws-elasticsearch/common-versions.tf deleted file mode 120000 index fa7f5509f..000000000 --- a/modules/integration_aws-elasticsearch/common-versions.tf +++ /dev/null @@ -1 +0,0 @@ -../../common/module/versions.tf \ No newline at end of file diff --git a/modules/integration_aws-elasticsearch/common-versions.tf b/modules/integration_aws-elasticsearch/common-versions.tf new file 
mode 100644 index 000000000..d77818c04 --- /dev/null +++ b/modules/integration_aws-elasticsearch/common-versions.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + signalfx = { + source = "splunk-terraform/signalfx" + version = ">= 7.0.0" + } + } + required_version = ">= 0.12.26" +} diff --git a/modules/integration_aws-elasticsearch/conf/00-heartbeat.yaml b/modules/integration_aws-elasticsearch/conf/00-heartbeat.yaml new file mode 100644 index 000000000..94544dbb6 --- /dev/null +++ b/modules/integration_aws-elasticsearch/conf/00-heartbeat.yaml @@ -0,0 +1,13 @@ +module: AWS Elasticsearch +name: heartbeat + +transformation: false +aggregation: ".mean(by=['DomainName'])" +filtering: "filter('namespace', 'AWS/ES')" + +signals: + signal: + metric: Nodes + filter: "filter('stat', 'mean')" +rules: + critical: diff --git a/modules/integration_aws-elasticsearch/conf/03-5xx.yaml b/modules/integration_aws-elasticsearch/conf/03-5xx.yaml index 985c6322d..9e552304a 100644 --- a/modules/integration_aws-elasticsearch/conf/03-5xx.yaml +++ b/modules/integration_aws-elasticsearch/conf/03-5xx.yaml @@ -9,22 +9,47 @@ runbook_url: "https://docs.aws.amazon.com/opensearch-service/latest/developergui value_unit: "%" signals: - A: + error_stream: metric: 5xx - B: + open_search_stream: metric: OpenSearchRequests - signal: - formula: (A/B*100) + elastic_search_stream: + metric: ElasticsearchRequests + open_search_signal: + formula: (error_stream/open_search_stream*100) + publish: true + elastic_search_signal: + formula: (error_stream/elastic_search_stream*100) + publish: true rules: - critical: + critical_open_search: threshold: 10 comparator: ">" lasting_duration: "5m" lasting_at_least: 0.9 - major: + signal: open_search_signal + severity: critical + major_open_search: threshold: 5 comparator: ">" - dependency: critical + dependency: critical_open_search lasting_duration: "5m" lasting_at_least: 0.9 + signal: open_search_signal + severity: major + critical_elastic_search: + threshold: 10 
+ comparator: ">" + lasting_duration: "5m" + lasting_at_least: 0.9 + signal: elastic_search_signal + severity: critical + major_elastic_search: + threshold: 5 + comparator: ">" + dependency: critical_elastic_search + lasting_duration: "5m" + lasting_at_least: 0.9 + signal: elastic_search_signal + severity: major diff --git a/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml b/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml new file mode 100644 index 000000000..a1d25af5d --- /dev/null +++ b/modules/integration_aws-elasticsearch/conf/05-cluster-status.yaml @@ -0,0 +1,25 @@ +module: AWS Elasticsearch +name: Cluster Status + +aggregation: ".min(over='15m')" +filtering: "filter('namespace', 'AWS/ES') and filter('stat', 'upper')" + +signals: + red: + metric: ClusterStatus.red + publish: true + yellow: + metric: ClusterStatus.yellow + publish: true + +rules: + critical: + threshold: 1 + comparator: ">=" + description: "is red" + signal: red + major: + threshold: 1 + comparator: ">=" + description: "is yellow" + signal: yellow diff --git a/modules/integration_aws-elasticsearch/conf/06-free-space.yaml b/modules/integration_aws-elasticsearch/conf/06-free-space.yaml new file mode 100644 index 000000000..5917575e9 --- /dev/null +++ b/modules/integration_aws-elasticsearch/conf/06-free-space.yaml @@ -0,0 +1,21 @@ +module: AWS Elasticsearch +name: "Free space" + +transformation: ".scale(0.001)" +aggregation: "" + +filtering: "filter('namespace', 'AWS/ES')" +value_unit: "Gibibyte" + +signals: + signal: + metric: "FreeStorageSpace" + filter: "filter('stat', 'lower') and filter('NodeId', '*')" +rules: + major: + threshold: 40 + comparator: "<" + dependency: critical + critical: + threshold: 20 + comparator: "<" diff --git a/modules/integration_aws-elasticsearch/conf/07-ultrawarm-free-space.yaml b/modules/integration_aws-elasticsearch/conf/07-ultrawarm-free-space.yaml new file mode 100644 index 000000000..1b38eb28a --- /dev/null +++ 
b/modules/integration_aws-elasticsearch/conf/07-ultrawarm-free-space.yaml @@ -0,0 +1,21 @@ +module: AWS Elasticsearch +name: "Ultrawarm Free space" + +transformation: ".scale(0.001)" +aggregation: "" + +filtering: "filter('namespace', 'AWS/ES')" +value_unit: "Gibibyte" + +signals: + signal: + metric: "WarmFreeStorageSpace" + filter: "filter('stat', 'lower') and filter('NodeId', '*')" +rules: + major: + threshold: 15 + comparator: "<" + dependency: critical + critical: + threshold: 10 + comparator: "<" diff --git a/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml b/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml new file mode 100644 index 000000000..7a5260af6 --- /dev/null +++ b/modules/integration_aws-elasticsearch/conf/08-cluster-cpu.yaml @@ -0,0 +1,24 @@ +module: AWS Elasticsearch +name: "Cluster CPU" + +transformation: ".min(over='45m')" +aggregation: "" +filtering: "filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*')" + +signals: + data_node_cpu: + metric: CPUUtilization + warm_node_cpu: + metric: WarmCPUUtilization + signal: + formula: union(data_node_cpu, warm_node_cpu) + + +rules: + major: + threshold: 80 + comparator: ">" + dependency: critical + critical: + threshold: 90 + comparator: ">" diff --git a/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml b/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml new file mode 100644 index 000000000..64ca64c93 --- /dev/null +++ b/modules/integration_aws-elasticsearch/conf/09-master-cpu.yaml @@ -0,0 +1,19 @@ +module: AWS Elasticsearch +name: "Master CPU" + +transformation: ".min(over='20m')" +aggregation: "" +filtering: "filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*')" + +signals: + signal: + metric: MasterCPUUtilization + +rules: + major: + threshold: 50 + comparator: ">" + dependency: critical + critical: + threshold: 70 + comparator: ">" diff --git 
a/modules/integration_aws-elasticsearch/detectors-elasticsearch.tf b/modules/integration_aws-elasticsearch/detectors-elasticsearch.tf deleted file mode 100644 index a19d711a2..000000000 --- a/modules/integration_aws-elasticsearch/detectors-elasticsearch.tf +++ /dev/null @@ -1,240 +0,0 @@ -resource "signalfx_detector" "heartbeat" { - name = format("%s %s", local.detector_name_prefix, "AWS ElasticSearch heartbeat") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - from signalfx.detectors.not_reporting import not_reporting - signal = data('Nodes', filter=filter('namespace', 'AWS/ES') and filter('stat', 'mean') and ${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') - not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') -EOF - - rule { - description = "has not reported in ${var.heartbeat_timeframe}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.heartbeat_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") - tip = var.heartbeat_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.heartbeat_max_delay -} - -resource "signalfx_detector" "cluster_status" { - name = format("%s %s", local.detector_name_prefix, "AWS ElasticSearch cluster status") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - A = data('ClusterStatus.red', filter=filter('namespace', 'AWS/ES') and filter('stat', 'upper') and ${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('A') - B = data('ClusterStatus.yellow', filter=filter('namespace', 'AWS/ES') and filter('stat', 'upper') and ${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('B') - detect(when(A >= 1)).publish('CRIT') - detect(when(B >= 1)).publish('MAJOR') -EOF - - rule { - description = "is red" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.cluster_status_disabled_critical, var.cluster_status_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_status_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.cluster_status_runbook_url, var.runbook_url), "") - tip = var.cluster_status_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - rule { - description = "is yellow" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cluster_status_disabled_major, var.cluster_status_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cluster_status_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cluster_status_runbook_url, var.runbook_url), "") - tip = var.cluster_status_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.cluster_status_max_delay -} - -resource "signalfx_detector" "free_space" { - name = format("%s %s", local.detector_name_prefix, "AWS ElasticSearch cluster free storage space") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - viz_options { - label = "signal" - value_unit = "Gibibyte" - } - - program_text = <<-EOF - signal = data('FreeStorageSpace', filter=filter('namespace', 'AWS/ES') and filter('stat', 'lower') and filter('NodeId', '*') and ${module.filtering.signalflow})${var.free_space_aggregation_function}${var.free_space_transformation_function}.scale(0.001).publish('signal') - detect(when(signal < ${var.free_space_threshold_critical})).publish('CRIT') - detect(when(signal < ${var.free_space_threshold_major}) and (not when(signal < ${var.free_space_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "is too low < ${var.free_space_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.free_space_disabled_critical, var.free_space_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.free_space_notifications, "critical", []), var.notifications.critical), 
null) - runbook_url = try(coalesce(var.free_space_runbook_url, var.runbook_url), "") - tip = var.free_space_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too low < ${var.free_space_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.free_space_disabled_major, var.free_space_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.free_space_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.free_space_runbook_url, var.runbook_url), "") - tip = var.free_space_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.free_space_max_delay -} - -resource "signalfx_detector" "ultrawarm_free_space" { - name = format("%s %s", local.detector_name_prefix, "AWS ElasticSearch cluster UltraWarm free storage space") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - viz_options { - label = "signal" - value_unit = "Gibibyte" - } - - program_text = <<-EOF - signal = data('WarmFreeStorageSpace', filter=filter('namespace', 'AWS/ES') and filter('stat', 'lower') and filter('NodeId', '*') and ${module.filtering.signalflow})${var.ultrawarm_free_space_aggregation_function}${var.ultrawarm_free_space_transformation_function}.scale(0.001).publish('signal') - detect(when(signal < ${var.ultrawarm_free_space_threshold_critical})).publish('CRIT') - detect(when(signal < ${var.ultrawarm_free_space_threshold_major}) and (not when(signal < ${var.ultrawarm_free_space_threshold_critical}))).publish('MAJOR') -EOF - - rule { - 
description = "is too low < ${var.ultrawarm_free_space_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.ultrawarm_free_space_disabled_critical, var.ultrawarm_free_space_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.ultrawarm_free_space_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.ultrawarm_free_space_runbook_url, var.runbook_url), "") - tip = var.ultrawarm_free_space_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too low < ${var.ultrawarm_free_space_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.ultrawarm_free_space_disabled_major, var.ultrawarm_free_space_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.ultrawarm_free_space_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.ultrawarm_free_space_runbook_url, var.runbook_url), "") - tip = var.ultrawarm_free_space_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - max_delay = var.ultrawarm_free_space_max_delay -} - -resource "signalfx_detector" "cpu_90_15min" { - name = format("%s %s", local.detector_name_prefix, "AWS ElasticSearch cluster CPU") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - data_node_cpu = data('CPUUtilization', filter=filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*') and ${module.filtering.signalflow})${var.cpu_90_15min_aggregation_function}${var.cpu_90_15min_transformation_function} - warm_node_cpu = data('WarmCPUUtilization', filter=filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*') and ${module.filtering.signalflow})${var.cpu_90_15min_aggregation_function}${var.cpu_90_15min_transformation_function} - signal = union(data_node_cpu, warm_node_cpu).publish('signal') - detect(when(signal > ${var.cpu_90_15min_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.cpu_90_15min_threshold_major}) and (not when(signal > ${var.cpu_90_15min_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "is too high > ${var.cpu_90_15min_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.cpu_90_15min_disabled_critical, var.cpu_90_15min_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cpu_90_15min_notifications, "critical", []), var.notifications.critical), null) - runbook_url = try(coalesce(var.cpu_90_15min_runbook_url, var.runbook_url), "") - tip = var.cpu_90_15min_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.cpu_90_15min_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.cpu_90_15min_disabled_major, var.cpu_90_15min_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.cpu_90_15min_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.cpu_90_15min_runbook_url, var.runbook_url), "") - tip = var.cpu_90_15min_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.cpu_90_15min_max_delay -} - -resource "signalfx_detector" "master_cpu_90_15min" { - name = format("%s %s", local.detector_name_prefix, "AWS ElasticSearch cluster Master nodes CPU") - - authorized_writer_teams = var.authorized_writer_teams - teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) - tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) - - program_text = <<-EOF - signal = data('MasterCPUUtilization', filter=filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*') and ${module.filtering.signalflow})${var.master_cpu_90_15min_aggregation_function}${var.master_cpu_90_15min_transformation_function}.publish('signal') - detect(when(signal > ${var.master_cpu_90_15min_threshold_critical})).publish('CRIT') - detect(when(signal > ${var.master_cpu_90_15min_threshold_major}) and (not when(signal > ${var.master_cpu_90_15min_threshold_critical}))).publish('MAJOR') -EOF - - rule { - description = "is too high > ${var.master_cpu_90_15min_threshold_critical}" - severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.master_cpu_90_15min_disabled_critical, var.master_cpu_90_15min_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.master_cpu_90_15min_notifications, "critical", 
[]), var.notifications.critical), null) - runbook_url = try(coalesce(var.master_cpu_90_15min_runbook_url, var.runbook_url), "") - tip = var.master_cpu_90_15min_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - rule { - description = "is too high > ${var.master_cpu_90_15min_threshold_major}" - severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.master_cpu_90_15min_disabled_major, var.master_cpu_90_15min_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.master_cpu_90_15min_notifications, "major", []), var.notifications.major), null) - runbook_url = try(coalesce(var.master_cpu_90_15min_runbook_url, var.runbook_url), "") - tip = var.master_cpu_90_15min_tip - parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject - parameterized_body = var.message_body == "" ? local.rule_body : var.message_body - } - - max_delay = var.master_cpu_90_15min_max_delay -} diff --git a/modules/integration_aws-elasticsearch/detectors-gen.tf b/modules/integration_aws-elasticsearch/detectors-gen.tf index 252dcb106..6b46be85d 100644 --- a/modules/integration_aws-elasticsearch/detectors-gen.tf +++ b/modules/integration_aws-elasticsearch/detectors-gen.tf @@ -1,3 +1,32 @@ +resource "signalfx_detector" "heartbeat" { + name = format("%s %s", local.detector_name_prefix, "AWS Elasticsearch heartbeat") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + from signalfx.detectors.not_reporting import not_reporting + base_filtering = filter('namespace', 'AWS/ES') + signal = data('Nodes', filter=base_filtering and filter('stat', 'mean') and 
${module.filtering.signalflow})${var.heartbeat_aggregation_function}.publish('signal') + not_reporting.detector(stream=signal, resource_identifier=None, duration='${var.heartbeat_timeframe}', auto_resolve_after='${local.heartbeat_auto_resolve_after}').publish('CRIT') +EOF + + rule { + description = "has not reported in ${var.heartbeat_timeframe}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.heartbeat_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.heartbeat_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.heartbeat_runbook_url, var.runbook_url), "") + tip = var.heartbeat_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject_novalue : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.heartbeat_max_delay +} + resource "signalfx_detector" "jvm_memory_pressure" { name = format("%s %s", local.detector_name_prefix, "AWS Elasticsearch jvm memory pressure") @@ -100,25 +129,29 @@ resource "signalfx_detector" "fivexx_http_response" { tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) viz_options { - label = "signal" + label = "elastic_search_signal" value_suffix = "%" } program_text = <<-EOF base_filtering = filter('namespace', 'AWS/ES') and filter('stat', 'sum') - A = data('5xx', filter=base_filtering and ${module.filtering.signalflow})${var.fivexx_http_response_aggregation_function}${var.fivexx_http_response_transformation_function} - B = data('OpenSearchRequests', filter=base_filtering and ${module.filtering.signalflow})${var.fivexx_http_response_aggregation_function}${var.fivexx_http_response_transformation_function} - signal = (A/B*100).publish('signal') - detect(when(signal > ${var.fivexx_http_response_threshold_critical}, lasting=%{if var.fivexx_http_response_lasting_duration_critical == 
null}None%{else}'${var.fivexx_http_response_lasting_duration_critical}'%{endif}, at_least=${var.fivexx_http_response_at_least_percentage_critical})).publish('CRIT') - detect(when(signal > ${var.fivexx_http_response_threshold_major}, lasting=%{if var.fivexx_http_response_lasting_duration_major == null}None%{else}'${var.fivexx_http_response_lasting_duration_major}'%{endif}, at_least=${var.fivexx_http_response_at_least_percentage_major}) and (not when(signal > ${var.fivexx_http_response_threshold_critical}, lasting=%{if var.fivexx_http_response_lasting_duration_critical == null}None%{else}'${var.fivexx_http_response_lasting_duration_critical}'%{endif}, at_least=${var.fivexx_http_response_at_least_percentage_critical}))).publish('MAJOR') + error_stream = data('5xx', filter=base_filtering and ${module.filtering.signalflow})${var.fivexx_http_response_aggregation_function}${var.fivexx_http_response_transformation_function} + open_search_stream = data('OpenSearchRequests', filter=base_filtering and ${module.filtering.signalflow})${var.fivexx_http_response_aggregation_function}${var.fivexx_http_response_transformation_function} + elastic_search_stream = data('ElasticsearchRequests', filter=base_filtering and ${module.filtering.signalflow})${var.fivexx_http_response_aggregation_function}${var.fivexx_http_response_transformation_function} + open_search_signal = (error_stream/open_search_stream*100).publish('open_search_signal') + elastic_search_signal = (error_stream/elastic_search_stream*100).publish('elastic_search_signal') + detect(when(open_search_signal > ${var.fivexx_http_response_threshold_critical_open_search}, lasting=%{if var.fivexx_http_response_lasting_duration_critical_open_search == null}None%{else}'${var.fivexx_http_response_lasting_duration_critical_open_search}'%{endif}, at_least=${var.fivexx_http_response_at_least_percentage_critical_open_search})).publish('CRIT_OPEN_SEARCH') + detect(when(open_search_signal > 
${var.fivexx_http_response_threshold_major_open_search}, lasting=%{if var.fivexx_http_response_lasting_duration_major_open_search == null}None%{else}'${var.fivexx_http_response_lasting_duration_major_open_search}'%{endif}, at_least=${var.fivexx_http_response_at_least_percentage_major_open_search}) and (not when(open_search_signal > ${var.fivexx_http_response_threshold_critical_open_search}, lasting=%{if var.fivexx_http_response_lasting_duration_critical_open_search == null}None%{else}'${var.fivexx_http_response_lasting_duration_critical_open_search}'%{endif}, at_least=${var.fivexx_http_response_at_least_percentage_critical_open_search}))).publish('MAJOR_OPEN_SEARCH') + detect(when(elastic_search_signal > ${var.fivexx_http_response_threshold_critical_elastic_search}, lasting=%{if var.fivexx_http_response_lasting_duration_critical_elastic_search == null}None%{else}'${var.fivexx_http_response_lasting_duration_critical_elastic_search}'%{endif}, at_least=${var.fivexx_http_response_at_least_percentage_critical_elastic_search})).publish('CRIT_ELASTIC_SEARCH') + detect(when(elastic_search_signal > ${var.fivexx_http_response_threshold_major_elastic_search}, lasting=%{if var.fivexx_http_response_lasting_duration_major_elastic_search == null}None%{else}'${var.fivexx_http_response_lasting_duration_major_elastic_search}'%{endif}, at_least=${var.fivexx_http_response_at_least_percentage_major_elastic_search}) and (not when(elastic_search_signal > ${var.fivexx_http_response_threshold_critical_elastic_search}, lasting=%{if var.fivexx_http_response_lasting_duration_critical_elastic_search == null}None%{else}'${var.fivexx_http_response_lasting_duration_critical_elastic_search}'%{endif}, at_least=${var.fivexx_http_response_at_least_percentage_critical_elastic_search}))).publish('MAJOR_ELASTIC_SEARCH') EOF rule { - description = "is too high > ${var.fivexx_http_response_threshold_critical}%" + description = "is too high > ${var.fivexx_http_response_threshold_critical_open_search}%" 
severity = "Critical" - detect_label = "CRIT" - disabled = coalesce(var.fivexx_http_response_disabled_critical, var.fivexx_http_response_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.fivexx_http_response_notifications, "critical", []), var.notifications.critical), null) + detect_label = "CRIT_OPEN_SEARCH" + disabled = coalesce(var.fivexx_http_response_disabled_critical_open_search, var.fivexx_http_response_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.fivexx_http_response_notifications, "critical_open_search", []), var.notifications.critical /* module-wide recipients are indexed by severity, as in the other detectors of this file; a rule-name key would make try() fall back to null */), null) runbook_url = try(coalesce(var.fivexx_http_response_runbook_url, var.runbook_url), "") tip = var.fivexx_http_response_tip parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject @@ -126,11 +159,35 @@ EOF } rule { - description = "is too high > ${var.fivexx_http_response_threshold_major}%" + description = "is too high > ${var.fivexx_http_response_threshold_major_open_search}%" severity = "Major" - detect_label = "MAJOR" - disabled = coalesce(var.fivexx_http_response_disabled_major, var.fivexx_http_response_disabled, var.detectors_disabled) - notifications = try(coalescelist(lookup(var.fivexx_http_response_notifications, "major", []), var.notifications.major), null) + detect_label = "MAJOR_OPEN_SEARCH" + disabled = coalesce(var.fivexx_http_response_disabled_major_open_search, var.fivexx_http_response_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.fivexx_http_response_notifications, "major_open_search", []), var.notifications.major), null) + runbook_url = try(coalesce(var.fivexx_http_response_runbook_url, var.runbook_url), "") + tip = var.fivexx_http_response_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ?
local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.fivexx_http_response_threshold_critical_elastic_search}%" + severity = "Critical" + detect_label = "CRIT_ELASTIC_SEARCH" + disabled = coalesce(var.fivexx_http_response_disabled_critical_elastic_search, var.fivexx_http_response_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.fivexx_http_response_notifications, "critical_elastic_search", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.fivexx_http_response_runbook_url, var.runbook_url), "") + tip = var.fivexx_http_response_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.fivexx_http_response_threshold_major_elastic_search}%" + severity = "Major" + detect_label = "MAJOR_ELASTIC_SEARCH" + disabled = coalesce(var.fivexx_http_response_disabled_major_elastic_search, var.fivexx_http_response_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.fivexx_http_response_notifications, "major_elastic_search", []), var.notifications.major), null) runbook_url = try(coalesce(var.fivexx_http_response_runbook_url, var.runbook_url), "") tip = var.fivexx_http_response_tip parameterized_subject = var.message_subject == "" ?
local.rule_subject : var.message_subject @@ -182,3 +239,221 @@ EOF max_delay = var.shard_count_max_delay } +resource "signalfx_detector" "cluster_status" { + name = format("%s %s", local.detector_name_prefix, "AWS Elasticsearch cluster status") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('namespace', 'AWS/ES') and filter('stat', 'upper') + red = data('ClusterStatus.red', filter=base_filtering and ${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('red') + yellow = data('ClusterStatus.yellow', filter=base_filtering and ${module.filtering.signalflow})${var.cluster_status_aggregation_function}${var.cluster_status_transformation_function}.publish('yellow') + detect(when(red >= ${var.cluster_status_threshold_critical}, lasting=%{if var.cluster_status_lasting_duration_critical == null}None%{else}'${var.cluster_status_lasting_duration_critical}'%{endif}, at_least=${var.cluster_status_at_least_percentage_critical})).publish('CRIT') + detect(when(yellow >= ${var.cluster_status_threshold_major}, lasting=%{if var.cluster_status_lasting_duration_major == null}None%{else}'${var.cluster_status_lasting_duration_major}'%{endif}, at_least=${var.cluster_status_at_least_percentage_major})).publish('MAJOR') +EOF + + rule { + description = "is red >= ${var.cluster_status_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cluster_status_disabled_critical, var.cluster_status_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_status_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cluster_status_runbook_url, var.runbook_url), "") + tip = var.cluster_status_tip + 
parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is yellow >= ${var.cluster_status_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.cluster_status_disabled_major, var.cluster_status_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_status_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.cluster_status_runbook_url, var.runbook_url), "") + tip = var.cluster_status_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.cluster_status_max_delay +} + +resource "signalfx_detector" "free_space" { + name = format("%s %s", local.detector_name_prefix, "AWS Elasticsearch free space") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + viz_options { + label = "signal" + value_unit = "Gibibyte" + } + + program_text = <<-EOF + base_filtering = filter('namespace', 'AWS/ES') + signal = data('FreeStorageSpace', filter=base_filtering and filter('stat', 'lower') and filter('NodeId', '*') and ${module.filtering.signalflow})${var.free_space_transformation_function}.publish('signal') + detect(when(signal < ${var.free_space_threshold_major}, lasting=%{if var.free_space_lasting_duration_major == null}None%{else}'${var.free_space_lasting_duration_major}'%{endif}, at_least=${var.free_space_at_least_percentage_major}) and (not when(signal < ${var.free_space_threshold_critical}, lasting=%{if var.free_space_lasting_duration_critical == null}None%{else}'${var.free_space_lasting_duration_critical}'%{endif}, 
at_least=${var.free_space_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal < ${var.free_space_threshold_critical}, lasting=%{if var.free_space_lasting_duration_critical == null}None%{else}'${var.free_space_lasting_duration_critical}'%{endif}, at_least=${var.free_space_at_least_percentage_critical})).publish('CRIT') +EOF + + rule { + description = "is too low < ${var.free_space_threshold_major}Gibibyte" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.free_space_disabled_major, var.free_space_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.free_space_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.free_space_runbook_url, var.runbook_url), "") + tip = var.free_space_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too low < ${var.free_space_threshold_critical}Gibibyte" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.free_space_disabled_critical, var.free_space_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.free_space_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.free_space_runbook_url, var.runbook_url), "") + tip = var.free_space_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.free_space_max_delay +} + +resource "signalfx_detector" "ultrawarm_free_space" { + name = format("%s %s", local.detector_name_prefix, "AWS Elasticsearch ultrawarm free space") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + viz_options { + label = "signal" + value_unit = "Gibibyte" + } + + program_text = <<-EOF + base_filtering = filter('namespace', 'AWS/ES') + signal = data('WarmFreeStorageSpace', filter=base_filtering and filter('stat', 'lower') and filter('NodeId', '*') and ${module.filtering.signalflow})${var.ultrawarm_free_space_transformation_function}.publish('signal') + detect(when(signal < ${var.ultrawarm_free_space_threshold_major}, lasting=%{if var.ultrawarm_free_space_lasting_duration_major == null}None%{else}'${var.ultrawarm_free_space_lasting_duration_major}'%{endif}, at_least=${var.ultrawarm_free_space_at_least_percentage_major}) and (not when(signal < ${var.ultrawarm_free_space_threshold_critical}, lasting=%{if var.ultrawarm_free_space_lasting_duration_critical == null}None%{else}'${var.ultrawarm_free_space_lasting_duration_critical}'%{endif}, at_least=${var.ultrawarm_free_space_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal < ${var.ultrawarm_free_space_threshold_critical}, lasting=%{if var.ultrawarm_free_space_lasting_duration_critical == null}None%{else}'${var.ultrawarm_free_space_lasting_duration_critical}'%{endif}, at_least=${var.ultrawarm_free_space_at_least_percentage_critical})).publish('CRIT') +EOF + + rule { + description = "is too low < ${var.ultrawarm_free_space_threshold_major}Gibibyte" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.ultrawarm_free_space_disabled_major, var.ultrawarm_free_space_disabled, var.detectors_disabled) + notifications = 
try(coalescelist(lookup(var.ultrawarm_free_space_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.ultrawarm_free_space_runbook_url, var.runbook_url), "") + tip = var.ultrawarm_free_space_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too low < ${var.ultrawarm_free_space_threshold_critical}Gibibyte" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.ultrawarm_free_space_disabled_critical, var.ultrawarm_free_space_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.ultrawarm_free_space_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.ultrawarm_free_space_runbook_url, var.runbook_url), "") + tip = var.ultrawarm_free_space_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.ultrawarm_free_space_max_delay +} + +resource "signalfx_detector" "cluster_cpu" { + name = format("%s %s", local.detector_name_prefix, "AWS Elasticsearch cluster cpu") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*') + data_node_cpu = data('CPUUtilization', filter=base_filtering and ${module.filtering.signalflow})${var.cluster_cpu_transformation_function} + warm_node_cpu = data('WarmCPUUtilization', filter=base_filtering and ${module.filtering.signalflow})${var.cluster_cpu_transformation_function} + signal = union(data_node_cpu, warm_node_cpu).publish('signal') + detect(when(signal > ${var.cluster_cpu_threshold_major}, lasting=%{if var.cluster_cpu_lasting_duration_major == null}None%{else}'${var.cluster_cpu_lasting_duration_major}'%{endif}, at_least=${var.cluster_cpu_at_least_percentage_major}) and (not when(signal > ${var.cluster_cpu_threshold_critical}, lasting=%{if var.cluster_cpu_lasting_duration_critical == null}None%{else}'${var.cluster_cpu_lasting_duration_critical}'%{endif}, at_least=${var.cluster_cpu_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal > ${var.cluster_cpu_threshold_critical}, lasting=%{if var.cluster_cpu_lasting_duration_critical == null}None%{else}'${var.cluster_cpu_lasting_duration_critical}'%{endif}, at_least=${var.cluster_cpu_at_least_percentage_critical})).publish('CRIT') +EOF + + rule { + description = "is too high > ${var.cluster_cpu_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.cluster_cpu_disabled_major, var.cluster_cpu_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_cpu_notifications, "major", 
[]), var.notifications.major), null) + runbook_url = try(coalesce(var.cluster_cpu_runbook_url, var.runbook_url), "") + tip = var.cluster_cpu_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.cluster_cpu_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.cluster_cpu_disabled_critical, var.cluster_cpu_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.cluster_cpu_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.cluster_cpu_runbook_url, var.runbook_url), "") + tip = var.cluster_cpu_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + max_delay = var.cluster_cpu_max_delay +} + +resource "signalfx_detector" "master_cpu" { + name = format("%s %s", local.detector_name_prefix, "AWS Elasticsearch master cpu") + + authorized_writer_teams = var.authorized_writer_teams + teams = try(coalescelist(var.teams, var.authorized_writer_teams), null) + tags = compact(concat(local.common_tags, local.tags, var.extra_tags)) + + program_text = <<-EOF + base_filtering = filter('namespace', 'AWS/ES') and filter('stat', 'upper') and filter('NodeId', '*') + signal = data('MasterCPUUtilization', filter=base_filtering and ${module.filtering.signalflow})${var.master_cpu_transformation_function}.publish('signal') + detect(when(signal > ${var.master_cpu_threshold_major}, lasting=%{if var.master_cpu_lasting_duration_major == null}None%{else}'${var.master_cpu_lasting_duration_major}'%{endif}, at_least=${var.master_cpu_at_least_percentage_major}) and (not when(signal > ${var.master_cpu_threshold_critical}, lasting=%{if var.master_cpu_lasting_duration_critical == 
null}None%{else}'${var.master_cpu_lasting_duration_critical}'%{endif}, at_least=${var.master_cpu_at_least_percentage_critical}))).publish('MAJOR') + detect(when(signal > ${var.master_cpu_threshold_critical}, lasting=%{if var.master_cpu_lasting_duration_critical == null}None%{else}'${var.master_cpu_lasting_duration_critical}'%{endif}, at_least=${var.master_cpu_at_least_percentage_critical})).publish('CRIT') +EOF + + rule { + description = "is too high > ${var.master_cpu_threshold_major}" + severity = "Major" + detect_label = "MAJOR" + disabled = coalesce(var.master_cpu_disabled_major, var.master_cpu_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.master_cpu_notifications, "major", []), var.notifications.major), null) + runbook_url = try(coalesce(var.master_cpu_runbook_url, var.runbook_url), "") + tip = var.master_cpu_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? local.rule_body : var.message_body + } + + rule { + description = "is too high > ${var.master_cpu_threshold_critical}" + severity = "Critical" + detect_label = "CRIT" + disabled = coalesce(var.master_cpu_disabled_critical, var.master_cpu_disabled, var.detectors_disabled) + notifications = try(coalescelist(lookup(var.master_cpu_notifications, "critical", []), var.notifications.critical), null) + runbook_url = try(coalesce(var.master_cpu_runbook_url, var.runbook_url), "") + tip = var.master_cpu_tip + parameterized_subject = var.message_subject == "" ? local.rule_subject : var.message_subject + parameterized_body = var.message_body == "" ? 
local.rule_body : var.message_body + } + + max_delay = var.master_cpu_max_delay +} + diff --git a/modules/integration_aws-elasticsearch/moved.tf b/modules/integration_aws-elasticsearch/moved.tf new file mode 100644 index 000000000..db42ff2ac --- /dev/null +++ b/modules/integration_aws-elasticsearch/moved.tf @@ -0,0 +1,9 @@ +moved { + from = signalfx_detector.cpu_90_15min + to = signalfx_detector.cluster_cpu +} + +moved { + from = signalfx_detector.master_cpu_90_15min + to = signalfx_detector.master_cpu +} diff --git a/modules/integration_aws-elasticsearch/outputs.tf b/modules/integration_aws-elasticsearch/outputs.tf index 639a784ee..e400f73d6 100644 --- a/modules/integration_aws-elasticsearch/outputs.tf +++ b/modules/integration_aws-elasticsearch/outputs.tf @@ -1,13 +1,13 @@ +output "cluster_cpu" { + description = "Detector resource for cluster_cpu" + value = signalfx_detector.cluster_cpu +} + output "cluster_status" { description = "Detector resource for cluster_status" value = signalfx_detector.cluster_status } -output "cpu_90_15min" { - description = "Detector resource for cpu_90_15min" - value = signalfx_detector.cpu_90_15min -} - output "fivexx_http_response" { description = "Detector resource for fivexx_http_response" value = signalfx_detector.fivexx_http_response @@ -33,9 +33,9 @@ output "jvm_memory_pressure" { value = signalfx_detector.jvm_memory_pressure } -output "master_cpu_90_15min" { - description = "Detector resource for master_cpu_90_15min" - value = signalfx_detector.master_cpu_90_15min +output "master_cpu" { + description = "Detector resource for master_cpu" + value = signalfx_detector.master_cpu } output "shard_count" { diff --git a/modules/integration_aws-elasticsearch/variables-gen.tf b/modules/integration_aws-elasticsearch/variables-gen.tf index 0c860fe26..5f6e36028 100644 --- a/modules/integration_aws-elasticsearch/variables-gen.tf +++ b/modules/integration_aws-elasticsearch/variables-gen.tf @@ -1,3 +1,47 @@ +# 
heartbeat detector + +variable "heartbeat_notifications" { + description = "Notification recipients list per severity overridden for heartbeat detector" + type = map(list(string)) + default = {} +} + +variable "heartbeat_aggregation_function" { + description = "Aggregation function and group by for heartbeat detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".mean(by=['DomainName'])" +} + +variable "heartbeat_max_delay" { + description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = 900 +} + +variable "heartbeat_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "heartbeat_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "heartbeat_disabled" { + description = "Disable all alerting rules for heartbeat detector" + type = bool + default = null +} + +variable "heartbeat_timeframe" { + description = "Timeframe for heartbeat detector (i.e. 
\"10m\")" + type = string + default = "10m" +} + # jvm_memory_pressure detector variable "jvm_memory_pressure_notifications" { @@ -228,48 +272,94 @@ variable "fivexx_http_response_disabled" { default = null } -variable "fivexx_http_response_disabled_critical" { - description = "Disable critical alerting rule for fivexx_http_response detector" +variable "fivexx_http_response_disabled_critical_open_search" { + description = "Disable critical_open_search alerting rule for fivexx_http_response detector" type = bool default = null } -variable "fivexx_http_response_disabled_major" { - description = "Disable major alerting rule for fivexx_http_response detector" +variable "fivexx_http_response_disabled_major_open_search" { + description = "Disable major_open_search alerting rule for fivexx_http_response detector" type = bool default = null } -variable "fivexx_http_response_threshold_critical" { - description = "Critical threshold for fivexx_http_response detector in %" +variable "fivexx_http_response_disabled_critical_elastic_search" { + description = "Disable critical_elastic_search alerting rule for fivexx_http_response detector" + type = bool + default = null +} + +variable "fivexx_http_response_disabled_major_elastic_search" { + description = "Disable major_elastic_search alerting rule for fivexx_http_response detector" + type = bool + default = null +} + +variable "fivexx_http_response_threshold_critical_open_search" { + description = "Critical_open_search threshold for fivexx_http_response detector in %" type = number default = 10 } -variable "fivexx_http_response_lasting_duration_critical" { +variable "fivexx_http_response_lasting_duration_critical_open_search" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "5m" } -variable "fivexx_http_response_at_least_percentage_critical" { +variable "fivexx_http_response_at_least_percentage_critical_open_search" { description = "Percentage of lasting that conditions 
must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 0.9 } -variable "fivexx_http_response_threshold_major" { - description = "Major threshold for fivexx_http_response detector in %" +variable "fivexx_http_response_threshold_major_open_search" { + description = "Major_open_search threshold for fivexx_http_response detector in %" type = number default = 5 } -variable "fivexx_http_response_lasting_duration_major" { +variable "fivexx_http_response_lasting_duration_major_open_search" { description = "Minimum duration that conditions must be true before raising alert" type = string default = "5m" } -variable "fivexx_http_response_at_least_percentage_major" { +variable "fivexx_http_response_at_least_percentage_major_open_search" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.9 +} +variable "fivexx_http_response_threshold_critical_elastic_search" { + description = "Critical_elastic_search threshold for fivexx_http_response detector in %" + type = number + default = 10 +} + +variable "fivexx_http_response_lasting_duration_critical_elastic_search" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "fivexx_http_response_at_least_percentage_critical_elastic_search" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 0.9 +} +variable "fivexx_http_response_threshold_major_elastic_search" { + description = "Major_elastic_search threshold for fivexx_http_response detector in %" + type = number + default = 5 +} + +variable "fivexx_http_response_lasting_duration_major_elastic_search" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = "5m" +} + +variable "fivexx_http_response_at_least_percentage_major_elastic_search" { description = 
"Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" type = number default = 0.9 @@ -366,3 +456,429 @@ variable "shard_count_at_least_percentage_major" { type = number default = 1 } +# cluster_status detector + +variable "cluster_status_notifications" { + description = "Notification recipients list per severity overridden for cluster_status detector" + type = map(list(string)) + default = {} +} + +variable "cluster_status_aggregation_function" { + description = "Aggregation function and group by for cluster_status detector (i.e. \".mean(by=['host'])\")" + type = string + default = ".min(over='15m')" +} + +variable "cluster_status_transformation_function" { + description = "Transformation function for cluster_status detector (i.e. \".mean(over='5m')\")" + type = string + default = "" +} + +variable "cluster_status_max_delay" { + description = "Enforce max delay for cluster_status detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cluster_status_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cluster_status_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cluster_status_disabled" { + description = "Disable all alerting rules for cluster_status detector" + type = bool + default = null +} + +variable "cluster_status_disabled_critical" { + description = "Disable critical alerting rule for cluster_status detector" + type = bool + default = null +} + +variable "cluster_status_disabled_major" { + description = "Disable major alerting rule for cluster_status detector" + type = bool + default = null +} + +variable "cluster_status_threshold_critical" { + description = "Critical threshold for cluster_status detector" + type = number + default = 1 +} + +variable 
"cluster_status_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_status_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cluster_status_threshold_major" { + description = "Major threshold for cluster_status detector" + type = number + default = 1 +} + +variable "cluster_status_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_status_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# free_space detector + +variable "free_space_notifications" { + description = "Notification recipients list per severity overridden for free_space detector" + type = map(list(string)) + default = {} +} + +variable "free_space_transformation_function" { + description = "Transformation function for free_space detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".scale(0.001)" +} + +variable "free_space_max_delay" { + description = "Enforce max delay for free_space detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "free_space_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "free_space_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "free_space_disabled" { + description = "Disable all alerting rules for free_space detector" + type = bool + default = null +} + +variable "free_space_disabled_major" { + description = "Disable major alerting rule for free_space detector" + type = bool + default = null +} + +variable "free_space_disabled_critical" { + description = "Disable critical alerting rule for free_space detector" + type = bool + default = null +} + +variable "free_space_threshold_major" { + description = "Major threshold for free_space detector in Gibibyte" + type = number + default = 40 +} + +variable "free_space_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "free_space_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "free_space_threshold_critical" { + description = "Critical threshold for free_space detector in Gibibyte" + type = number + default = 20 +} + +variable "free_space_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "free_space_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert 
(>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# ultrawarm_free_space detector + +variable "ultrawarm_free_space_notifications" { + description = "Notification recipients list per severity overridden for ultrawarm_free_space detector" + type = map(list(string)) + default = {} +} + +variable "ultrawarm_free_space_transformation_function" { + description = "Transformation function for ultrawarm_free_space detector (i.e. \".mean(over='5m')\")" + type = string + default = ".scale(0.001)" +} + +variable "ultrawarm_free_space_max_delay" { + description = "Enforce max delay for ultrawarm_free_space detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "ultrawarm_free_space_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "ultrawarm_free_space_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "ultrawarm_free_space_disabled" { + description = "Disable all alerting rules for ultrawarm_free_space detector" + type = bool + default = null +} + +variable "ultrawarm_free_space_disabled_major" { + description = "Disable major alerting rule for ultrawarm_free_space detector" + type = bool + default = null +} + +variable "ultrawarm_free_space_disabled_critical" { + description = "Disable critical alerting rule for ultrawarm_free_space detector" + type = bool + default = null +} + +variable "ultrawarm_free_space_threshold_major" { + description = "Major threshold for ultrawarm_free_space detector in Gibibyte" + type = number + default = 15 +} + +variable "ultrawarm_free_space_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "ultrawarm_free_space_at_least_percentage_major" { + description = "Percentage of lasting 
that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "ultrawarm_free_space_threshold_critical" { + description = "Critical threshold for ultrawarm_free_space detector in Gibibyte" + type = number + default = 10 +} + +variable "ultrawarm_free_space_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "ultrawarm_free_space_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# cluster_cpu detector + +variable "cluster_cpu_notifications" { + description = "Notification recipients list per severity overridden for cluster_cpu detector" + type = map(list(string)) + default = {} +} + +variable "cluster_cpu_transformation_function" { + description = "Transformation function for cluster_cpu detector (i.e. \".mean(over='5m')\")" + type = string + default = ".min(over='45m')" +} + +variable "cluster_cpu_max_delay" { + description = "Enforce max delay for cluster_cpu detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "cluster_cpu_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "cluster_cpu_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "cluster_cpu_disabled" { + description = "Disable all alerting rules for cluster_cpu detector" + type = bool + default = null +} + +variable "cluster_cpu_disabled_major" { + description = "Disable major alerting rule for cluster_cpu detector" + type = bool + default = null +} + +variable "cluster_cpu_disabled_critical" { + description = "Disable critical alerting rule for cluster_cpu detector" + 
type = bool + default = null +} + +variable "cluster_cpu_threshold_major" { + description = "Major threshold for cluster_cpu detector" + type = number + default = 80 +} + +variable "cluster_cpu_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_cpu_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "cluster_cpu_threshold_critical" { + description = "Critical threshold for cluster_cpu detector" + type = number + default = 90 +} + +variable "cluster_cpu_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "cluster_cpu_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +# master_cpu detector + +variable "master_cpu_notifications" { + description = "Notification recipients list per severity overridden for master_cpu detector" + type = map(list(string)) + default = {} +} + +variable "master_cpu_transformation_function" { + description = "Transformation function for master_cpu detector (i.e. 
\".mean(over='5m')\")" + type = string + default = ".min(over='20m')" +} + +variable "master_cpu_max_delay" { + description = "Enforce max delay for master_cpu detector (use \"0\" or \"null\" for \"Auto\")" + type = number + default = null +} + +variable "master_cpu_tip" { + description = "Suggested first course of action or any note useful for incident handling" + type = string + default = "" +} + +variable "master_cpu_runbook_url" { + description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" + type = string + default = "" +} + +variable "master_cpu_disabled" { + description = "Disable all alerting rules for master_cpu detector" + type = bool + default = null +} + +variable "master_cpu_disabled_major" { + description = "Disable major alerting rule for master_cpu detector" + type = bool + default = null +} + +variable "master_cpu_disabled_critical" { + description = "Disable critical alerting rule for master_cpu detector" + type = bool + default = null +} + +variable "master_cpu_threshold_major" { + description = "Major threshold for master_cpu detector" + type = number + default = 50 +} + +variable "master_cpu_lasting_duration_major" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "master_cpu_at_least_percentage_major" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" + type = number + default = 1 +} +variable "master_cpu_threshold_critical" { + description = "Critical threshold for master_cpu detector" + type = number + default = 70 +} + +variable "master_cpu_lasting_duration_critical" { + description = "Minimum duration that conditions must be true before raising alert" + type = string + default = null +} + +variable "master_cpu_at_least_percentage_critical" { + description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)" 
+ type = number + default = 1 +} diff --git a/modules/integration_aws-elasticsearch/variables.tf b/modules/integration_aws-elasticsearch/variables.tf deleted file mode 100644 index 3b4f16f06..000000000 --- a/modules/integration_aws-elasticsearch/variables.tf +++ /dev/null @@ -1,373 +0,0 @@ -# Module specific - -# Heartbeat detector - -variable "heartbeat_max_delay" { - description = "Enforce max delay for heartbeat detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = 900 -} - -variable "heartbeat_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "heartbeat_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "heartbeat_disabled" { - description = "Disable all alerting rules for heartbeat detector" - type = bool - default = null -} - -variable "heartbeat_notifications" { - description = "Notification recipients list per severity overridden for heartbeat detector" - type = map(list(string)) - default = {} -} - -variable "heartbeat_timeframe" { - description = "Timeframe for heartbeat detector (i.e. \"10m\")" - type = string - default = "10m" -} - -variable "heartbeat_aggregation_function" { - description = "Aggregation function and group by for heartbeat detector (i.e. 
\".mean(by=['host'])\")" - type = string - default = ".mean(by=['DomainName'])" -} - -# Cluster_status detector - -variable "cluster_status_max_delay" { - description = "Enforce max delay for cluster_status detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "cluster_status_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "cluster_status_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "cluster_status_disabled" { - description = "Disable all alerting rules for cluster_status detector" - type = bool - default = null -} - -variable "cluster_status_disabled_critical" { - description = "Disable critical alerting rule for cluster_status detector" - type = bool - default = null -} - -variable "cluster_status_disabled_major" { - description = "Disable major alerting rule for cluster_status detector" - type = bool - default = null -} - -variable "cluster_status_notifications" { - description = "Notification recipients list per severity overridden for cluster_status detector" - type = map(list(string)) - default = {} -} - -variable "cluster_status_aggregation_function" { - description = "Aggregation function and group by for cluster_status detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "cluster_status_transformation_function" { - description = "Transformation function for cluster_status detector (i.e. 
\".mean(over='5m')\")" - type = string - default = ".min(over='15m')" -} - -# Free_space detector - -variable "free_space_max_delay" { - description = "Enforce max delay for free_space detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "free_space_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "free_space_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "free_space_disabled" { - description = "Disable all alerting rules for free_space detector" - type = bool - default = null -} - -variable "free_space_disabled_critical" { - description = "Disable critical alerting rule for free_space detector" - type = bool - default = null -} - -variable "free_space_disabled_major" { - description = "Disable major alerting rule for free_space detector" - type = bool - default = null -} - -variable "free_space_notifications" { - description = "Notification recipients list per severity overridden for free_space detector" - type = map(list(string)) - default = {} -} - -variable "free_space_aggregation_function" { - description = "Aggregation function and group by for free_space detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "free_space_transformation_function" { - description = "Transformation function for free_space detector (i.e. 
\".mean(over='5m')\")" - type = string - default = "" -} - -variable "free_space_threshold_critical" { - description = "Critical threshold for free_space detector" - type = number - default = 20 -} - -variable "free_space_threshold_major" { - description = "Major threshold for free_space detector" - type = number - default = 40 -} - -# ultrawarm_free_space detector - -variable "ultrawarm_free_space_max_delay" { - description = "Enforce max delay for ultrawarm_free_space detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "ultrawarm_free_space_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "ultrawarm_free_space_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "ultrawarm_free_space_disabled" { - description = "Disable all alerting rules for ultrawarm_free_space detector" - type = bool - default = null -} - -variable "ultrawarm_free_space_disabled_critical" { - description = "Disable critical alerting rule for ultrawarm_free_space detector" - type = bool - default = null -} - -variable "ultrawarm_free_space_disabled_major" { - description = "Disable major alerting rule for ultrawarm_free_space detector" - type = bool - default = null -} - -variable "ultrawarm_free_space_notifications" { - description = "Notification recipients list per severity overridden for ultrawarm_free_space detector" - type = map(list(string)) - default = {} -} - -variable "ultrawarm_free_space_aggregation_function" { - description = "Aggregation function and group by for ultrawarm_free_space detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "ultrawarm_free_space_transformation_function" { - description = "Transformation function for ultrawarm_free_space detector (i.e. 
\".mean(over='5m')\")" - type = string - default = "" -} - -variable "ultrawarm_free_space_threshold_critical" { - description = "Critical threshold for ultrawarm_free_space detector" - type = number - default = 10 -} - -variable "ultrawarm_free_space_threshold_major" { - description = "Major threshold for ultrawarm_free_space detector" - type = number - default = 15 -} - -# CPU_90_15min detector - -variable "cpu_90_15min_max_delay" { - description = "Enforce max delay for cpu_90_15min detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "cpu_90_15min_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "cpu_90_15min_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "cpu_90_15min_disabled" { - description = "Disable all alerting rules for cpu_90_15min detector" - type = bool - default = null -} - -variable "cpu_90_15min_disabled_critical" { - description = "Disable critical alerting rule for cpu_90_15min detector" - type = bool - default = null -} - -variable "cpu_90_15min_disabled_major" { - description = "Disable major alerting rule for cpu_90_15min detector" - type = bool - default = null -} - -variable "cpu_90_15min_notifications" { - description = "Notification recipients list per severity overridden for cpu_90_15min detector" - type = map(list(string)) - default = {} -} - -variable "cpu_90_15min_aggregation_function" { - description = "Aggregation function and group by for cpu_90_15min detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "cpu_90_15min_transformation_function" { - description = "Transformation function for cpu_90_15min detector (i.e. 
\".mean(over='5m')\")" - type = string - default = ".min(over='45m')" -} - -variable "cpu_90_15min_threshold_critical" { - description = "Critical threshold for cpu_90_15min detector" - type = number - default = 90 -} - -variable "cpu_90_15min_threshold_major" { - description = "Major threshold for cpu_90_15min detector" - type = number - default = 80 -} - -# master_cpu_90_15min detector - -variable "master_cpu_90_15min_max_delay" { - description = "Enforce max delay for master_cpu_90_15min detector (use \"0\" or \"null\" for \"Auto\")" - type = number - default = null -} - -variable "master_cpu_90_15min_tip" { - description = "Suggested first course of action or any note useful for incident handling" - type = string - default = "" -} - -variable "master_cpu_90_15min_runbook_url" { - description = "URL like SignalFx dashboard or wiki page which can help to troubleshoot the incident cause" - type = string - default = "" -} - -variable "master_cpu_90_15min_disabled" { - description = "Disable all alerting rules for master_cpu_90_15min detector" - type = bool - default = null -} - -variable "master_cpu_90_15min_disabled_critical" { - description = "Disable critical alerting rule for master_cpu_90_15min detector" - type = bool - default = null -} - -variable "master_cpu_90_15min_disabled_major" { - description = "Disable major alerting rule for master_cpu_90_15min detector" - type = bool - default = null -} - -variable "master_cpu_90_15min_notifications" { - description = "Notification recipients list per severity overridden for master_cpu_90_15min detector" - type = map(list(string)) - default = {} -} - -variable "master_cpu_90_15min_aggregation_function" { - description = "Aggregation function and group by for master_cpu_90_15min detector (i.e. \".mean(by=['host'])\")" - type = string - default = "" -} - -variable "master_cpu_90_15min_transformation_function" { - description = "Transformation function for master_cpu_90_15min detector (i.e. 
\".mean(over='5m')\")" - type = string - default = ".min(over='20m')" -} - -variable "master_cpu_90_15min_threshold_critical" { - description = "Critical threshold for master_cpu_90_15min detector" - type = number - default = 70 -} - -variable "master_cpu_90_15min_threshold_major" { - description = "Major threshold for master_cpu_90_15min detector" - type = number - default = 50 -} diff --git a/scripts/templates/detector.tf.j2 b/scripts/templates/detector.tf.j2 index e0dcaa28c..8dd38f066 100644 --- a/scripts/templates/detector.tf.j2 +++ b/scripts/templates/detector.tf.j2 @@ -55,12 +55,12 @@ resource "signalfx_detector" "{{ id }}" { {%- if 'formula' in signal %} {{ key }} = {{ signal.formula }} {%- endif -%} - {%- if loop.last -%} + {%- if loop.last or ('publish' in signal and signal.publish) -%} .publish('{{ key }}') {%- endif %} {%- endfor %} {%- macro macro_when(severity, rule) -%} - when(signal {{ rule.comparator }} ${var.{{ id }}_threshold_{{ severity }}}, lasting=%{if var.{{ id }}_lasting_duration_{{ severity }} == null}None%{else}'${var.{{ id }}_lasting_duration_{{ severity }}}'%{endif}, at_least=${var.{{ id }}_at_least_percentage_{{ severity }}}){% if rule.append_condition is string %} {{ rule.append_condition }}{% endif %} + when({{ rule.signal if rule.signal is string else 'signal' }} {{ rule.comparator }} ${var.{{ id }}_threshold_{{ severity }}}, lasting=%{if var.{{ id }}_lasting_duration_{{ severity }} == null}None%{else}'${var.{{ id }}_lasting_duration_{{ severity }}}'%{endif}, at_least=${var.{{ id }}_at_least_percentage_{{ severity }}}){% if rule.append_condition is string %} {{ rule.append_condition }}{% endif %} {%- endmacro -%} {%- for severity, rule in rules.items() %} {%- if type == 'heartbeat' -%} @@ -93,7 +93,7 @@ EOF {%- endif %} description = "{{ compare_string }} {{ rule.comparator }} ${var.{{ id }}_threshold_{{ severity }}}{{ value_unit | default("") }}" {%- endif %} - severity = "{{ severity | capitalize }}" + severity = "{{ (rule.severity 
if rule.severity is string else severity) | capitalize }}" detect_label = "{{ severity | replace('critical', 'crit') | replace ('warning', 'warn') | upper }}" disabled = coalesce({% if rules | length > 1 %}var.{{ id }}_disabled_{{ severity }}, {% endif %}var.{{ id }}_disabled, var.detectors_disabled) notifications = try(coalescelist(lookup(var.{{ id }}_notifications, "{{ severity }}", []), var.notifications.{{ severity }}), null)