Skip to content

Commit

Permalink
feat: terraform for alerting on metrics (#10192)
Browse files Browse the repository at this point in the history
Initial Terraform configuration for Grafana/Prometheus alerts.

It comes pre-packaged with 1 rule that fires if the proven chain has not
advanced in 30 minutes.

Note: I've deployed this locally to my kind cluster, but for prod this
needs to be deployed by someone who has access to the Slack webhook URL.
Generally, we ought to have a CI job that deploys our metrics stack to
prod. To that end, I filed
#10191.

fix #9956
  • Loading branch information
just-mitch authored Dec 2, 2024
1 parent a9d418c commit 05c9e5d
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 0 deletions.
127 changes: 127 additions & 0 deletions spartan/metrics/terraform/grafana.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
# See https://registry.terraform.io/providers/grafana/grafana/latest/docs

# Provider requirements for this module.
# The "~> 3.13.2" constraint accepts 3.13.x patch releases from 3.13.2 upward,
# but not 3.14 or later.
terraform {
  required_providers {
    grafana = {
      source  = "grafana/grafana"
      version = "~> 3.13.2"
    }
  }
}

# Grafana provider configuration. Both the instance URL and the credentials
# are supplied by the caller through input variables.
provider "grafana" {
  url  = var.grafana_url
  auth = var.grafana_auth
}

# Folder whose uid is referenced by the alert rule group defined in this file.
resource "grafana_folder" "rule_folder" {
  title = "Alerting Rules"
}


# Slack contact point; the webhook URL is supplied via var.slack_url.
resource "grafana_contact_point" "slack" {
  name = "slack"

  slack {
    url = var.slack_url
  }
}

# Notification policy tree.
#
# Root policy: route alerts to the Slack contact point, grouped by the
# "service_namespace" label.
#
# Nested policy: alerts whose service_namespace equals "smoke" are routed to
# the same contact point but with the "always" mute timing attached.
resource "grafana_notification_policy" "ignore_policy" {
  contact_point = grafana_contact_point.slack.name
  group_by      = ["service_namespace"]

  policy {
    contact_point = grafana_contact_point.slack.name

    matcher {
      label = "service_namespace"
      match = "="
      value = "smoke"
    }

    # Reference the mute timing resource rather than repeating its name as a
    # string literal. The reference gives Terraform an explicit dependency, so
    # the mute timing is created before this policy and destroyed after it.
    mute_timings = [grafana_mute_timing.mute_timing_always.name]
  }
}

# Mute timing named "always", used by the notification policy for the
# "smoke" namespace.
# NOTE(review): the intervals block sets no constraints; presumably this
# matches every point in time (hence the name) - confirm against Grafana docs.
resource "grafana_mute_timing" "mute_timing_always" {
  name = "always"

  intervals {}
}

# Alert rules evaluated every 60 seconds, stored in the "Alerting Rules" folder.
resource "grafana_rule_group" "rule_group_minutely" {
  org_id           = 1
  name             = "minutely-evaluation-group"
  folder_uid       = grafana_folder.rule_folder.uid
  interval_seconds = 60

  # Alerts when the proven block height has increased by less than 1 over the
  # trailing 30 minutes. Stage "B" is the alert condition.
  rule {
    name      = "Proven Chain is Live"
    condition = "B"

    # Stage A: instant Prometheus query for the 30-minute increase of the
    # proven block height.
    data {
      ref_id = "A"

      relative_time_range {
        from = 600
        to   = 0
      }

      datasource_uid = "spartan-metrics-prometheus"
      model = jsonencode({
        disableTextWrap     = false,
        editorMode          = "code",
        expr                = "increase(aztec_archiver_block_height{aztec_status=\"proven\"}[30m])",
        fullMetaSearch      = false,
        includeNullMetadata = true,
        instant             = true,
        intervalMs          = 1000,
        legendFormat        = "__auto",
        maxDataPoints       = 43200,
        range               = false,
        refId               = "A",
        useBackend          = false
      })
    }

    # Stage B: threshold expression over stage A; true when the value is < 1.
    data {
      ref_id = "B"

      relative_time_range {
        from = 600
        to   = 0
      }

      datasource_uid = "__expr__"
      model = jsonencode({
        conditions = [
          {
            evaluator = { params = [1], type = "lt" },
            operator  = { type = "and" },
            # Kept equal to this stage's ref_id; the original pointed at a
            # non-existent stage "C".
            query   = { params = ["B"] },
            reducer = { params = [], type = "last" },
            type    = "query"
          }
        ],
        datasource    = { type = "__expr__", uid = "__expr__" },
        expression    = "A",
        intervalMs    = 1000,
        maxDataPoints = 43200,
        # Must agree with ref_id above and with the rule's `condition`.
        refId = "B",
        type  = "threshold"
      })
    }

    no_data_state  = "NoData"
    exec_err_state = "Error"
    for            = "1m"
    annotations    = {}
    labels         = {}
    is_paused      = false
  }
}
11 changes: 11 additions & 0 deletions spartan/metrics/terraform/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# URL of the Grafana instance; consumed by the grafana provider block.
variable "grafana_url" {
  type = string
}

# Credentials passed to the grafana provider's `auth` argument.
# Marked sensitive so Terraform redacts the value in plan/apply output.
variable "grafana_auth" {
  type      = string
  sensitive = true
}

# Slack webhook URL used by the Slack contact point.
# Anyone holding this URL can post to the channel, so it is marked sensitive
# and Terraform redacts it in plan/apply output.
variable "slack_url" {
  type      = string
  sensitive = true
}

0 comments on commit 05c9e5d

Please sign in to comment.