Skip to content

Commit

Permalink
fix(validators): observability
Browse files Browse the repository at this point in the history
  • Loading branch information
auricom committed Jan 24, 2025
1 parent 5c98428 commit b30bfd3
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 70 deletions.
2 changes: 1 addition & 1 deletion ansible_collections/validators/galaxy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ namespace: binarybuilders
name: validators

# The version of the collection. Must be compatible with semantic versioning
version: 1.6.0
version: 1.6.1

# The path to the Markdown (.md) readme file. This path is relative to the root of the collection
readme: README.md
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
mode: "755"
remote_src: true
src: /tmp/pd-x86_64-unknown-linux-gnu/{{ chain_penumbra_appd_name }}
notify:
- Restart appd-{{ chain_penumbra_chain_id }}
- Restart cometbft-{{ chain_penumbra_chain_id }}

- name: Appd | Create systemd service file
ansible.builtin.copy:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ groups:
expr: tendermint_p2p_peers{chain_id="{{ observability_chain_id }}"} < 2 or cometbft_p2p_peers{chain_id="{{ observability_chain_id }}"} < 2
for: 5m
labels:
severity: warning
severity: major
service: validator
component: cosmos-sdk
annotations:
Expand All @@ -29,7 +29,7 @@ groups:
expr: increase(tendermint_consensus_latest_block_height{chain_id="{{ observability_chain_id }}"}[5m]) < 10 or increase(cometbft_consensus_latest_block_height{chain_id="{{ observability_chain_id }}"}[5m]) < 10
for: 5m
labels:
severity: warning
severity: major
service: validator
component: cosmos-sdk
annotations:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ groups:
expr: increase(cosmos_validator_watcher_solo_missed_blocks{chain_id="{{ observability_chain_id }}"}[3m]) > 10
for: 3m
labels:
severity: warning
severity: major
service: validator
component: cosmos-validator-watcher
annotations:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,55 +12,48 @@ groups:
annotations:
description: 'Filesystem "{{ $labels.mountpoint }}" on "{{ $labels.instance }}" is read-only.'

- alert: DiskFullIn1Week
- alert: LowFilesystemSpace
expr: |
predict_linear(
(
node_filesystem_avail_bytes{
fstype!~"rootfs|nfs4|tmpfs",
instance=~".*validator.*"
}[24h]
),
7 * 24 * 3600
) < 0
100 * (
node_filesystem_avail_bytes{fstype!~"rootfs|nfs4|tmpfs",instance=~".*validator.*"}
/
node_filesystem_size_bytes
) < 15
for: 15m
labels:
severity: critical
component: node_exporter
annotations:
description: |
Filesystem on {{ $labels.mountpoint }} has {{ printf "%.2f" $value }}% free space
(instance: {{ $labels.instance }})
summary: "Low disk space (below 15%)"

- alert: DiskFull14D
expr: |
predict_linear(node_filesystem_avail_bytes{fstype!~"rootfs|nfs4|tmpfs",instance=~".*validator.*"}[14d], 14 * 86400) < 0
and
rate(
node_filesystem_avail_bytes{
fstype!~"rootfs|nfs4|tmpfs",
instance=~".*validator.*"
}[24h]
) < 0
for: 5m
(time() - node_filesystem_avail_bytes_created{instance=~".*validator.*"}) > 1209600 # 14-day minimum age
for: 24h
keep_firing_for: 12h
labels:
severity: warning
component: node_exporter
annotations:
description: 'Filesystem "{{ $labels.mountpoint }}" on "{{ $labels.instance }}" will be out of diskspace within 1 week based on current growth rate.'
description: 'Filesystem "{{ $labels.mountpoint }}" on "{{ $labels.instance }}" will be out of diskspace within 2 weeks based on current growth rate.'

- alert: DiskFull12H
- alert: DiskFull72H
expr: |
predict_linear(
(
node_filesystem_avail_bytes{
fstype!~"rootfs|nfs4|tmpfs",
instance=~".*validator.*"
}[6h]
),
12 * 3600
) < 0
predict_linear(node_filesystem_avail_bytes{fstype!~"rootfs|nfs4|tmpfs",instance=~".*validator.*"}[7d],72*3600) < 0
and
rate(
node_filesystem_avail_bytes{
fstype!~"rootfs|nfs4|tmpfs",
instance=~".*validator.*"
}[6h]
) < 0
for: 5m
(time() - node_filesystem_avail_bytes_created{fstype!~"rootfs|nfs4|tmpfs",instance=~".*validator.*"}) > 604800 # Ignore filesystems younger than 7 days (instant vector required in arithmetic)
for: 2h
keep_firing_for: 1h
labels:
severity: critical
component: node_exporter
annotations:
description: 'Filesystem "{{ $labels.mountpoint }}" on "{{ $labels.instance }}" will be out of diskspace within 12 hours based on current growth rate.'
description: 'Filesystem "{{ $labels.mountpoint }}" on "{{ $labels.instance }}" will be out of diskspace within 72 hours based on current growth rate.'

- alert: NodeDiskFull
expr: node_filesystem_avail_bytes/node_filesystem_size_bytes < 0.01
Expand All @@ -71,55 +64,43 @@ groups:
annotations:
description: 'Filesystem "{{ $labels.mountpoint }}" on "{{ $labels.instance }}" is out of diskspace (< 1% free).'

- alert: InodesFreeExhaustionIn1Week
- alert: InodeExhaustion72H
expr: |
predict_linear(
(
node_filesystem_files_free{
fstype!~"rootfs|nfs4|tmpfs",
instance=~".*validator.*"
}[24h]
),
7 * 24 * 3600
) < 0
and
rate(
node_filesystem_files_free{
fstype!~"rootfs|nfs4|tmpfs",
instance=~".*validator.*"
}[24h]
}[7d],
72 * 3600
) < 0
for: 5m
and
(time() - node_filesystem_files_created{instance=~".*validator.*"}) > 604800
for: 6h
keep_firing_for: 2h
labels:
severity: warning
severity: critical
component: node_exporter
annotations:
description: 'Filesystem "{{ $labels.mountpoint }}" on "{{ $labels.instance }}" will be out of inodes within 1 week based on current growth rate.'
description: 'Filesystem "{{ $labels.mountpoint }}" on "{{ $labels.instance }}" will be out of inodes within 72 hours based on current growth rate.'

- alert: InodesFreeExhaustionIn12Hours
- alert: InodeExhaustion14D
expr: |
predict_linear(
(
node_filesystem_files_free{
fstype!~"rootfs|nfs4|tmpfs",
instance=~".*validator.*"
}[6h]
),
12 * 3600
) < 0
and
rate(
node_filesystem_files_free{
fstype!~"rootfs|nfs4|tmpfs",
instance=~".*validator.*"
}[6h]
}[14d],
14 * 86400
) < 0
for: 5m
and
(time() - node_filesystem_files_created{instance=~".*validator.*"}) > 1209600
for: 24h
keep_firing_for: 12h
labels:
severity: critical
severity: warning
component: node_exporter
annotations:
description: 'Filesystem "{{ $labels.mountpoint }}" on "{{ $labels.instance }}" will be out of inode numbers within 12 hours.'
description: 'Filesystem "{{ $labels.mountpoint }}" on "{{ $labels.instance }}" will be out of inode numbers within 2 weeks.'

- alert: InodeFull
expr: node_filesystem_files_free/node_filesystem_files{instance=~".*validator.*"} < 0.01
Expand Down

0 comments on commit b30bfd3

Please sign in to comment.