Skip to content

Commit

Permalink
metrics: refine alert rules (#3021) (#3023)
Browse files Browse the repository at this point in the history
  • Loading branch information
ti-chi-bot authored Oct 13, 2021
1 parent e7bb22c commit 44526df
Showing 1 changed file with 7 additions and 43 deletions.
50 changes: 7 additions & 43 deletions metrics/alertmanager/ticdc.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,30 +37,6 @@ groups:
value: '{{ $value }}'
summary: cdc processor resolved ts delay more than 5 minutes

- - alert: ticdc_puller_entry_sorter_sort_duration_time_more_than_2s
- expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket[1m])) > 2
- for: 1m
- labels:
- env: ENV_LABELS_ENV
- level: warning
- expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket[1m])) > 2
- annotations:
- description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
- value: '{{ $value }}'
- summary: ticdc_puller_entry_sorter sort duration time more than 2s

- - alert: ticdc_puller_entry_sorter_merge_duration_time_more_than_2s
- expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket[1m])) > 2
- for: 1m
- labels:
- env: ENV_LABELS_ENV
- level: warning
- expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket[1m])) > 2
- annotations:
- description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
- value: '{{ $value }}'
- summary: ticdc_puller_entry_sorter merge duration time more than 2s

- alert: ticdc_mounter_unmarshal_and_mount_time_more_than_1s
expr: histogram_quantile(0.9, rate(ticdc_mounter_unmarshal_and_mount_bucket[1m])) * 1000 > 1000
for: 1m
Expand Down Expand Up @@ -121,28 +97,28 @@ groups:
summary: ticdc puller entry sorter merge latency is too high

- alert: tikv_cdc_min_resolved_ts_no_change_for_1m
- expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1
+ expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
- expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1
+ expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
- value: '{{ $value }}'
+ value: '{{ $labels.instance }}'
summary: tikv cdc min resolved ts no change for 1m

- - alert: tikv_cdc_scan_duration_seconds_more_than_30s
- expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 30
+ - alert: tikv_cdc_scan_duration_seconds_more_than_10min
+ expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
- expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 30
+ expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
- summary: tikv cdc scan duration seconds more than 30s
+ summary: tikv cdc scan duration seconds more than 10 min

- alert: ticdc_sink_mysql_execution_error
expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0
Expand Down Expand Up @@ -179,15 +155,3 @@ groups:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiCDC heap memory usage is over 10 GB

- - alert: tikv_enabled_hibernate_regions
- expr: sum(tikv_config_raftstore{name="hibernate_regions"}) > 0
- for: 1m
- labels:
- env: ENV_LABELS_ENV
- level: warning
- expr: sum(tikv_config_raftstore{name="hibernate_regions"}) > 0
- annotations:
- description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
- value: '{{ $value }}'
- summary: cdc will break tikv hibernate regions

0 comments on commit 44526df

Please sign in to comment.