Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a duration for which conditions must be true before triggering an alert on Integration_aws-rds-common #550

Merged
merged 3 commits into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions modules/integration_aws-rds-common/detectors-rds-common.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ resource "signalfx_detector" "cpu_90_15min" {

program_text = <<-EOF
signal = data('CPUUtilization', filter=filter('namespace', 'AWS/RDS') and filter('stat', 'mean') and filter('DBInstanceIdentifier', '*') and ${module.filtering.signalflow})${var.cpu_90_15min_aggregation_function}${var.cpu_90_15min_transformation_function}.publish('signal')
detect(when(signal > ${var.cpu_90_15min_threshold_critical})).publish('CRIT')
detect(when(signal > ${var.cpu_90_15min_threshold_major}) and (not when(signal > ${var.cpu_90_15min_threshold_critical}))).publish('MAJOR')
detect(when(signal > ${var.cpu_90_15min_threshold_critical}, lasting=%{if var.cpu_90_15min_lasting_duration_critical == null}None%{else}'${var.cpu_90_15min_lasting_duration_critical}'%{endif}, at_least=${var.cpu_90_15min_at_least_percentage_critical})).publish('CRIT')
pdecat marked this conversation as resolved.
Show resolved Hide resolved
detect(when(signal > ${var.cpu_90_15min_threshold_major}, lasting=%{if var.cpu_90_15min_lasting_duration_major == null}None%{else}'${var.cpu_90_15min_lasting_duration_major}'%{endif}, at_least=${var.cpu_90_15min_at_least_percentage_major}) and (not when(signal > ${var.cpu_90_15min_threshold_critical}, lasting=%{if var.cpu_90_15min_lasting_duration_critical == null}None%{else}'${var.cpu_90_15min_lasting_duration_critical}'%{endif}, at_least=${var.cpu_90_15min_at_least_percentage_critical}))).publish('MAJOR')
EOF

rule {
Expand Down Expand Up @@ -81,8 +81,8 @@ resource "signalfx_detector" "free_space_low" {
program_text = <<-EOF
free = data('FreeStorageSpace', filter=filter('namespace', 'AWS/RDS') and filter('stat', 'mean') and filter('DBInstanceIdentifier', '*') and ${module.filtering.signalflow})${var.free_space_low_aggregation_function}${var.free_space_low_transformation_function}
signal = free.scale(1/1024**3).publish('signal') # Bytes to Gibibytes
detect(when(signal < ${var.free_space_low_threshold_critical})).publish('CRIT')
detect(when(signal < ${var.free_space_low_threshold_major}) and (not when(signal < ${var.free_space_low_threshold_critical}))).publish('MAJOR')
detect(when(signal < ${var.free_space_low_threshold_critical}, lasting=%{if var.free_space_low_lasting_duration_critical == null}None%{else}'${var.free_space_low_lasting_duration_critical}'%{endif}, at_least=${var.free_space_low_at_least_percentage_critical})).publish('CRIT')
detect(when(signal < ${var.free_space_low_threshold_major}, lasting=%{if var.free_space_low_lasting_duration_major == null}None%{else}'${var.free_space_low_lasting_duration_major}'%{endif}, at_least=${var.free_space_low_at_least_percentage_major}) and (not when(signal < ${var.free_space_low_threshold_critical}, lasting=%{if var.free_space_low_lasting_duration_critical == null}None%{else}'${var.free_space_low_lasting_duration_critical}'%{endif}, at_least=${var.free_space_low_at_least_percentage_critical}))).publish('MAJOR')
EOF

rule {
Expand Down Expand Up @@ -121,8 +121,8 @@ resource "signalfx_detector" "replica_lag" {

program_text = <<-EOF
signal = data('ReplicaLag', filter=filter('namespace', 'AWS/RDS') and filter('stat', 'mean') and filter('DBInstanceIdentifier', '*') and ${module.filtering.signalflow})${var.replica_lag_aggregation_function}${var.replica_lag_transformation_function}.publish('signal')
detect(when(signal > ${var.replica_lag_threshold_critical})).publish('CRIT')
detect(when(signal > ${var.replica_lag_threshold_major}) and (not when(signal > ${var.replica_lag_threshold_critical}))).publish('MAJOR')
detect(when(signal > ${var.replica_lag_threshold_critical}, lasting=%{if var.replica_lag_lasting_duration_critical == null}None%{else}'${var.replica_lag_lasting_duration_critical}'%{endif}, at_least=${var.replica_lag_at_least_percentage_critical})).publish('CRIT')
detect(when(signal > ${var.replica_lag_threshold_major}, lasting=%{if var.replica_lag_lasting_duration_major == null}None%{else}'${var.replica_lag_lasting_duration_major}'%{endif}, at_least=${var.replica_lag_at_least_percentage_major}) and (not when(signal > ${var.replica_lag_threshold_critical}, lasting=%{if var.replica_lag_lasting_duration_critical == null}None%{else}'${var.replica_lag_lasting_duration_critical}'%{endif}, at_least=${var.replica_lag_at_least_percentage_critical}))).publish('MAJOR')
EOF

rule {
Expand Down
71 changes: 71 additions & 0 deletions modules/integration_aws-rds-common/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,30 @@ variable "cpu_90_15min_threshold_major" {
default = 80
}

# Optional `lasting=` duration for the critical-threshold condition of the
# cpu_90_15min detector. When left null the program text renders
# `lasting=None`; otherwise the value is emitted as a single-quoted string.
variable "cpu_90_15min_lasting_duration_critical" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# Value interpolated as the `at_least=` argument of the critical-threshold
# `when()` condition in the cpu_90_15min detector program text.
variable "cpu_90_15min_at_least_percentage_critical" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  # Enforce the documented range at plan time rather than letting an
  # out-of-range value be rendered into the detector program text.
  validation {
    condition     = var.cpu_90_15min_at_least_percentage_critical >= 0 && var.cpu_90_15min_at_least_percentage_critical <= 1
    error_message = "The cpu_90_15min_at_least_percentage_critical value must be between 0.0 and 1.0 (inclusive)."
  }
}

# Optional `lasting=` duration for the major-threshold condition of the
# cpu_90_15min detector. When left null the program text renders
# `lasting=None`; otherwise the value is emitted as a single-quoted string.
variable "cpu_90_15min_lasting_duration_major" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# Value interpolated as the `at_least=` argument of the major-threshold
# `when()` condition in the cpu_90_15min detector program text.
variable "cpu_90_15min_at_least_percentage_major" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  # Enforce the documented range at plan time rather than letting an
  # out-of-range value be rendered into the detector program text.
  validation {
    condition     = var.cpu_90_15min_at_least_percentage_major >= 0 && var.cpu_90_15min_at_least_percentage_major <= 1
    error_message = "The cpu_90_15min_at_least_percentage_major value must be between 0.0 and 1.0 (inclusive)."
  }
}

# Free_space_low detector

variable "free_space_low_max_delay" {
Expand Down Expand Up @@ -180,6 +204,30 @@ variable "free_space_low_threshold_major" {
default = 40
}

# Optional `lasting=` duration for the critical-threshold condition of the
# free_space_low detector. When left null the program text renders
# `lasting=None`; otherwise the value is emitted as a single-quoted string.
variable "free_space_low_lasting_duration_critical" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# Value interpolated as the `at_least=` argument of the critical-threshold
# `when()` condition in the free_space_low detector program text.
variable "free_space_low_at_least_percentage_critical" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  # Enforce the documented range at plan time rather than letting an
  # out-of-range value be rendered into the detector program text.
  validation {
    condition     = var.free_space_low_at_least_percentage_critical >= 0 && var.free_space_low_at_least_percentage_critical <= 1
    error_message = "The free_space_low_at_least_percentage_critical value must be between 0.0 and 1.0 (inclusive)."
  }
}

# Optional `lasting=` duration for the major-threshold condition of the
# free_space_low detector. When left null the program text renders
# `lasting=None`; otherwise the value is emitted as a single-quoted string.
variable "free_space_low_lasting_duration_major" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# Value interpolated as the `at_least=` argument of the major-threshold
# `when()` condition in the free_space_low detector program text.
variable "free_space_low_at_least_percentage_major" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  # Enforce the documented range at plan time rather than letting an
  # out-of-range value be rendered into the detector program text.
  validation {
    condition     = var.free_space_low_at_least_percentage_major >= 0 && var.free_space_low_at_least_percentage_major <= 1
    error_message = "The free_space_low_at_least_percentage_major value must be between 0.0 and 1.0 (inclusive)."
  }
}

# Replica_lag detector

variable "replica_lag_max_delay" {
Expand Down Expand Up @@ -248,3 +296,26 @@ variable "replica_lag_threshold_major" {
default = 200
}

# Optional `lasting=` duration for the critical-threshold condition of the
# replica_lag detector. When left null the program text renders
# `lasting=None`; otherwise the value is emitted as a single-quoted string.
variable "replica_lag_lasting_duration_critical" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# Value interpolated as the `at_least=` argument of the critical-threshold
# `when()` condition in the replica_lag detector program text.
variable "replica_lag_at_least_percentage_critical" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  # Enforce the documented range at plan time rather than letting an
  # out-of-range value be rendered into the detector program text.
  validation {
    condition     = var.replica_lag_at_least_percentage_critical >= 0 && var.replica_lag_at_least_percentage_critical <= 1
    error_message = "The replica_lag_at_least_percentage_critical value must be between 0.0 and 1.0 (inclusive)."
  }
}

# Optional `lasting=` duration for the major-threshold condition of the
# replica_lag detector. When left null the program text renders
# `lasting=None`; otherwise the value is emitted as a single-quoted string.
variable "replica_lag_lasting_duration_major" {
  type        = string
  default     = null
  description = "Minimum duration that conditions must be true before raising alert"
}

# Value interpolated as the `at_least=` argument of the major-threshold
# `when()` condition in the replica_lag detector program text.
variable "replica_lag_at_least_percentage_major" {
  description = "Percentage of lasting that conditions must be true before raising alert (>= 0.0 and <= 1.0)"
  type        = number
  default     = 1

  # Enforce the documented range at plan time rather than letting an
  # out-of-range value be rendered into the detector program text.
  validation {
    condition     = var.replica_lag_at_least_percentage_major >= 0 && var.replica_lag_at_least_percentage_major <= 1
    error_message = "The replica_lag_at_least_percentage_major value must be between 0.0 and 1.0 (inclusive)."
  }
}
Loading