From 4a52e2dfeed018e7518c499ce00b47393a6c6a3f Mon Sep 17 00:00:00 2001
From: Leonid Evdokimov
Date: Wed, 31 Oct 2018 17:38:57 +0300
Subject: [PATCH] Alert on high duration of high IO rate

This also fixes hkgsuperset.ooni.io missing from `dom0` and `hkg`
inventory groups.

See #155 and #226
---
NOTE(review): line breaks and indentation below were restored from a
whitespace-collapsed copy of this patch. Tokens are unchanged, but the
indentation of context lines is inferred -- confirm it against the target
files before applying (or apply with `git apply --ignore-whitespace`).

 ansible/inventory                              |  2 ++
 ansible/inventory-check.yml                    | 10 ++++++++++
 ansible/roles/prometheus/files/alert_rules.yml |  8 ++++++++
 3 files changed, 20 insertions(+)

diff --git a/ansible/inventory b/ansible/inventory
index 975a628e..151e64c5 100644
--- a/ansible/inventory
+++ b/ansible/inventory
@@ -7,6 +7,7 @@ ath
 ber
 tpo
 bigv
+fra
 
 [gh:children]
 wdc # technically it's i95.net, Radio Free Asia network, but GH has some boxes there
@@ -48,6 +49,7 @@ ooni-explorer-next.test.ooni.io
 wiki.ooni.io
 labs.ooni.io
 hkgjump.ooni.nu
+hkgsuperset.ooni.io
 
 [ams]
 explorer.ooni.io
diff --git a/ansible/inventory-check.yml b/ansible/inventory-check.yml
index 4313f607..e44552f2 100644
--- a/ansible/inventory-check.yml
+++ b/ansible/inventory-check.yml
@@ -2,6 +2,16 @@
 ---
 - import_playbook: ansible-version.yml
 
+- hosts: all
+  gather_facts: false
+  tasks:
+    - name: ensure that all inventory hosts are rooted to dom0
+      assert:
+        that:
+          - groups.all | difference(groups.dom0) | length == 0
+        msg: "Hosts in inventory not rooted to dom0: {{ groups.all | difference(groups.dom0) | sort | join(' ') }}"
+      run_once: true
+
 - hosts: all
   vars:
     ansible_become: false # root is not required here, also it's not `become: false` as variable declraed for `all` has precedence over directive :-/
diff --git a/ansible/roles/prometheus/files/alert_rules.yml b/ansible/roles/prometheus/files/alert_rules.yml
index b7a131b9..d9a84134 100644
--- a/ansible/roles/prometheus/files/alert_rules.yml
+++ b/ansible/roles/prometheus/files/alert_rules.yml
@@ -27,6 +27,14 @@ groups:
     annotations:
       summary: '{{ $labels.instance }} %iowait over 90%'
 
+  # the difference between node_disk_{io,read,write}_time_ms is not clear, `io` is NOT `read + write`, it may be greater, it may be less...
+  # All the nodes have `node_disk_io_time_ms`, but it can be verified with expr: (sum without(device) (node_disk_io_time_ms{job="node"}) or up{job="node"}) == 1
+  - alert: IOHigh
+    expr: irate(node_disk_io_time_ms{device!~"(nbd[0-9]+|dm-[0-9]+|ram[0-9]+|sr[0-9]+|md[0-9]+)"}[1m]) > 800
+    for: 2h
+    annotations:
+      summary: '{{ $labels.instance }}/{{ $labels.device }} spends {{ $value }}ms/s in IO over 2 hours'
+
   - alert: CPUHigh
     expr: sum without (mode, cpu) (irate(node_cpu{mode!="idle"}[1m])) > 0.75
     for: 8h