Skip to content

Commit

Permalink
Fix resolved alerts still inhibiting (prometheus#1331)
Browse files Browse the repository at this point in the history
* inhibit: update inhibition cache when alerts resolve

Signed-off-by: Simon Pasquier <[email protected]>

* inhibit: remove unnecessary fmt.Sprintf

Signed-off-by: Simon Pasquier <[email protected]>

* inhibit: add unit tests

Signed-off-by: Simon Pasquier <[email protected]>

* inhibit: use NopLogger in tests

Signed-off-by: Simon Pasquier <[email protected]>

* Update old alert with result of merge with new

On ingest, alerts with matching fingerprints are
merged if the new alert's start and end times
overlap with the old alert's.

The merge creates a new alert, which is then
updated in the internal alert store.

The original alert is not updated (because merge
creates a copy), so it is never marked as resolved
in the inhibitor's reference to it.

The code within the inhibitor relies on skipping
over resolved alerts, but because the old alert is
never updated it is never marked as resolved. Thus
it continues to inhibit other alerts until it is
cleaned up by the internal GC.

This commit updates the struct of the old alert
with the result of the merge with the new alert.

An alternative would be to always update the
inhibitor's internal cache of alerts regardless of
an alert's resolve status.

Signed-off-by: stuart nelson <[email protected]>

* Update inhibitor cache even if alert is resolved

This seems like a better choice than the previous
commit. I think it is more sane to have the
inhibitor update its own cache, rather than having
one of its pointers updated externally.

Signed-off-by: stuart nelson <[email protected]>
  • Loading branch information
stuartnelson3 authored and mxinden committed May 5, 2018
1 parent 9806beb commit c6af6a6
Show file tree
Hide file tree
Showing 3 changed files with 233 additions and 10 deletions.
10 changes: 2 additions & 8 deletions inhibit/inhibit.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ package inhibit

import (
"context"
"fmt"
"sync"
"time"

Expand Down Expand Up @@ -81,12 +80,7 @@ func (ih *Inhibitor) run(ctx context.Context) {
level.Error(ih.logger).Log("msg", "Error iterating alerts", "err", err)
continue
}
if a.Resolved() {
// As alerts can also time out without an update, we never
// handle new resolved alerts but invalidate the cache on read.
continue
}
// Populate the inhibition rules' cache.
// Update the inhibition rules' cache.
for _, r := range ih.rules {
if r.SourceMatchers.Match(a.Labels) {
r.set(a)
Expand Down Expand Up @@ -145,7 +139,7 @@ func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
for _, r := range ih.rules {
// Only inhibit if target matchers match but source matchers don't.
if inhibitedByFP, eq := r.hasEqual(lset); !r.SourceMatchers.Match(lset) && r.TargetMatchers.Match(lset) && eq {
ih.marker.SetInhibited(fp, fmt.Sprintf("%d", inhibitedByFP))
ih.marker.SetInhibited(fp, inhibitedByFP.String())
return true
}
}
Expand Down
161 changes: 159 additions & 2 deletions inhibit/inhibit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,20 @@ import (
"testing"
"time"

"github.com/go-kit/kit/log"
"github.com/kylelemons/godebug/pretty"
"github.com/prometheus/common/model"

"github.com/prometheus/alertmanager/config"
"github.com/prometheus/alertmanager/provider"
"github.com/prometheus/alertmanager/types"
"github.com/prometheus/common/model"
)

var nopLogger = log.NewNopLogger()

func TestInhibitRuleHasEqual(t *testing.T) {
t.Parallel()

now := time.Now()
cases := []struct {
initial map[model.Fingerprint]*types.Alert
Expand Down Expand Up @@ -135,14 +142,16 @@ func TestInhibitRuleHasEqual(t *testing.T) {
}

func TestInhibitRuleMatches(t *testing.T) {
t.Parallel()

// Simple inhibut rule
cr := config.InhibitRule{
SourceMatch: map[string]string{"s": "1"},
TargetMatch: map[string]string{"t": "1"},
Equal: model.LabelNames{"e"},
}
m := types.NewMarker()
ih := NewInhibitor(nil, []*config.InhibitRule{&cr}, m, nil)
ih := NewInhibitor(nil, []*config.InhibitRule{&cr}, m, nopLogger)
ir := ih.rules[0]
now := time.Now()
// Active alert that matches the source filter
Expand Down Expand Up @@ -226,3 +235,151 @@ func TestInhibitRuleGC(t *testing.T) {
t.Errorf(pretty.Compare(r.scache, after))
}
}

type fakeAlerts struct {
alerts []*types.Alert
finished chan struct{}
}

func newFakeAlerts(alerts []*types.Alert) *fakeAlerts {
return &fakeAlerts{
alerts: alerts,
finished: make(chan struct{}),
}
}

func (f *fakeAlerts) GetPending() provider.AlertIterator { return nil }
func (f *fakeAlerts) Get(model.Fingerprint) (*types.Alert, error) { return nil, nil }
func (f *fakeAlerts) Put(...*types.Alert) error { return nil }
func (f *fakeAlerts) Subscribe() provider.AlertIterator {
ch := make(chan *types.Alert)
done := make(chan struct{})
go func() {
for _, a := range f.alerts {
ch <- a
}
// Send another (meaningless) alert to make sure that the inhibitor has
// processed everything.
ch <- &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{},
StartsAt: time.Now(),
},
}
close(f.finished)
<-done
}()
return provider.NewAlertIterator(ch, done, nil)
}

func TestInhibit(t *testing.T) {
t.Parallel()

now := time.Now()
inhibitRule := func() *config.InhibitRule {
return &config.InhibitRule{
SourceMatch: map[string]string{"s": "1"},
TargetMatch: map[string]string{"t": "1"},
Equal: model.LabelNames{"e"},
}
}
// alertOne is muted by alertTwo when it is active.
alertOne := func() *types.Alert {
return &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{"t": "1", "e": "f"},
StartsAt: now.Add(-time.Minute),
EndsAt: now.Add(time.Hour),
},
}
}
alertTwo := func(resolved bool) *types.Alert {
var end time.Time
if resolved {
end = now.Add(-time.Second)
} else {
end = now.Add(time.Hour)
}
return &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{"s": "1", "e": "f"},
StartsAt: now.Add(-time.Minute),
EndsAt: end,
},
}
}

type exp struct {
lbls model.LabelSet
muted bool
}
for i, tc := range []struct {
alerts []*types.Alert
expected []exp
}{
{
// alertOne shouldn't be muted since alertTwo hasn't fired.
alerts: []*types.Alert{alertOne()},
expected: []exp{
{
lbls: model.LabelSet{"t": "1", "e": "f"},
muted: false,
},
},
},
{
// alertOne should be muted by alertTwo which is active.
alerts: []*types.Alert{alertOne(), alertTwo(false)},
expected: []exp{
{
lbls: model.LabelSet{"t": "1", "e": "f"},
muted: true,
},
{
lbls: model.LabelSet{"s": "1", "e": "f"},
muted: false,
},
},
},
{
// alertOne shouldn't be muted since alertTwo is resolved.
alerts: []*types.Alert{alertOne(), alertTwo(false), alertTwo(true)},
expected: []exp{
{
lbls: model.LabelSet{"t": "1", "e": "f"},
muted: false,
},
{
lbls: model.LabelSet{"s": "1", "e": "f"},
muted: false,
},
},
},
} {
ap := newFakeAlerts(tc.alerts)
mk := types.NewMarker()
inhibitor := NewInhibitor(ap, []*config.InhibitRule{inhibitRule()}, mk, nopLogger)

go func() {
for ap.finished != nil {
select {
case <-ap.finished:
ap.finished = nil
default:
}
}
inhibitor.Stop()
}()
inhibitor.Run()

for _, expected := range tc.expected {
if inhibitor.Mutes(expected.lbls) != expected.muted {
mute := "unmuted"
if expected.muted {
mute = "muted"
}
t.Errorf("tc: %d, expected alert with labels %q to be %s", i, expected.lbls, mute)
}
}
}
}
72 changes: 72 additions & 0 deletions test/acceptance/inhibit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ import (
func TestInhibiting(t *testing.T) {
t.Parallel()

// This integration test checks that alerts can be inhibited and that an
// inhibited alert will be notified again as soon as the inhibiting alert
// gets resolved.

conf := `
route:
receiver: "default"
Expand Down Expand Up @@ -64,6 +68,10 @@ inhibit_rules:
// second batch of notifications.
am.Push(At(2.2), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa"))

// InstanceDown in zone aa should fire again in the third batch of
// notifications once JobDown in zone aa gets resolved.
am.Push(At(3.6), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(2.2, 3.6))

co.Want(Between(2, 2.5),
Alert("alertname", "test1", "job", "testjob", "zone", "aa").Active(1),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1),
Expand All @@ -76,5 +84,69 @@ inhibit_rules:
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(2.2),
)

co.Want(Between(4, 4.5),
Alert("alertname", "test1", "job", "testjob", "zone", "aa").Active(1),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "ab").Active(1),
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(2.2, 3.6),
)

at.Run()
}

func TestAlwaysInhibiting(t *testing.T) {
t.Parallel()

// This integration test checks that when inhibited and inhibiting alerts
// gets resolved at the same time, the final notification contains both
// alerts.

conf := `
route:
receiver: "default"
group_by: []
group_wait: 1s
group_interval: 1s
repeat_interval: 1s
receivers:
- name: "default"
webhook_configs:
- url: 'http://%s'
inhibit_rules:
- source_match:
alertname: JobDown
target_match:
alertname: InstanceDown
equal:
- job
- zone
`

at := NewAcceptanceTest(t, &AcceptanceOpts{
Tolerance: 150 * time.Millisecond,
})

co := at.Collector("webhook")
wh := NewWebhook(co)

am := at.Alertmanager(fmt.Sprintf(conf, wh.Address()))

am.Push(At(1), Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa"))
am.Push(At(1), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa"))

am.Push(At(2.6), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(1, 2.6))
am.Push(At(2.6), Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1, 2.6))

co.Want(Between(2, 2.5),
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(1),
)

co.Want(Between(3, 3.5),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1, 2.6),
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(1, 2.6),
)

at.Run()
}

0 comments on commit c6af6a6

Please sign in to comment.