Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

inhibit: update inhibition cache when alerts resolve #1309

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions inhibit/inhibit.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ package inhibit

import (
"context"
"fmt"
"sync"
"time"

Expand Down Expand Up @@ -81,14 +80,9 @@ func (ih *Inhibitor) run(ctx context.Context) {
level.Error(ih.logger).Log("msg", "Error iterating alerts", "err", err)
continue
}
if a.Resolved() {
// As alerts can also time out without an update, we never
// handle new resolved alerts but invalidate the cache on read.
continue
}
// Populate the inhibition rules' cache.
// Update the inhibition rules' cache.
for _, r := range ih.rules {
if r.SourceMatchers.Match(a.Labels) {
if r.exists(a) || r.SourceMatchers.Match(a.Labels) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is the purpose of r.exists(a)? It seems like from the code, if it exists, then it gets set (which doesn't accomplish anything, since it already exists?)

Running the tests without this code still passes.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC the only case where r.exists(a) would be false is when AlertManager processes a resolved alert that hasn't been seen as firing previously (eg after a restart). Not sure it is worth to keep it then.

r.set(a)
}
}
Expand Down Expand Up @@ -145,7 +139,7 @@ func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
for _, r := range ih.rules {
// Only inhibit if target matchers match but source matchers don't.
if inhibitedByFP, eq := r.hasEqual(lset); !r.SourceMatchers.Match(lset) && r.TargetMatchers.Match(lset) && eq {
ih.marker.SetInhibited(fp, fmt.Sprintf("%d", inhibitedByFP))
ih.marker.SetInhibited(fp, inhibitedByFP.String())
return true
}
}
Expand Down Expand Up @@ -217,6 +211,15 @@ func (r *InhibitRule) set(a *types.Alert) {
r.scache[a.Fingerprint()] = a
}

// exists returns true if the alert is present in the source cache.
func (r *InhibitRule) exists(a *types.Alert) bool {
r.mtx.Lock()
defer r.mtx.Unlock()

_, ok := r.scache[a.Fingerprint()]
return ok
}

// hasEqual checks whether the source cache contains alerts matching
// the equal labels for the given label set.
func (r *InhibitRule) hasEqual(lset model.LabelSet) (model.Fingerprint, bool) {
Expand Down
161 changes: 159 additions & 2 deletions inhibit/inhibit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,20 @@ import (
"testing"
"time"

"github.com/go-kit/kit/log"
"github.com/kylelemons/godebug/pretty"
"github.com/prometheus/common/model"

"github.com/prometheus/alertmanager/config"
"github.com/prometheus/alertmanager/provider"
"github.com/prometheus/alertmanager/types"
"github.com/prometheus/common/model"
)

var nopLogger = log.NewNopLogger()

func TestInhibitRuleHasEqual(t *testing.T) {
t.Parallel()

now := time.Now()
cases := []struct {
initial map[model.Fingerprint]*types.Alert
Expand Down Expand Up @@ -135,14 +142,16 @@ func TestInhibitRuleHasEqual(t *testing.T) {
}

func TestInhibitRuleMatches(t *testing.T) {
t.Parallel()

// Simple inhibut rule
cr := config.InhibitRule{
SourceMatch: map[string]string{"s": "1"},
TargetMatch: map[string]string{"t": "1"},
Equal: model.LabelNames{"e"},
}
m := types.NewMarker()
ih := NewInhibitor(nil, []*config.InhibitRule{&cr}, m, nil)
ih := NewInhibitor(nil, []*config.InhibitRule{&cr}, m, nopLogger)
ir := ih.rules[0]
now := time.Now()
// Active alert that matches the source filter
Expand Down Expand Up @@ -226,3 +235,151 @@ func TestInhibitRuleGC(t *testing.T) {
t.Errorf(pretty.Compare(r.scache, after))
}
}

type fakeAlerts struct {
alerts []*types.Alert
finished chan struct{}
}

func newFakeAlerts(alerts []*types.Alert) *fakeAlerts {
return &fakeAlerts{
alerts: alerts,
finished: make(chan struct{}),
}
}

func (f *fakeAlerts) GetPending() provider.AlertIterator { return nil }
func (f *fakeAlerts) Get(model.Fingerprint) (*types.Alert, error) { return nil, nil }
func (f *fakeAlerts) Put(...*types.Alert) error { return nil }
func (f *fakeAlerts) Subscribe() provider.AlertIterator {
ch := make(chan *types.Alert)
done := make(chan struct{})
go func() {
for _, a := range f.alerts {
ch <- a
}
// Send another (meaningless) alert to make sure that the inhibitor has
// processed everything.
ch <- &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{},
StartsAt: time.Now(),
},
}
close(f.finished)
<-done
}()
return provider.NewAlertIterator(ch, done, nil)
}

func TestInhibit(t *testing.T) {
t.Parallel()

now := time.Now()
inhibitRule := func() *config.InhibitRule {
return &config.InhibitRule{
SourceMatch: map[string]string{"s": "1"},
TargetMatch: map[string]string{"t": "1"},
Equal: model.LabelNames{"e"},
}
}
// alertOne is muted by alertTwo when it is active.
alertOne := func() *types.Alert {
return &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{"t": "1", "e": "f"},
StartsAt: now.Add(-time.Minute),
EndsAt: now.Add(time.Hour),
},
}
}
alertTwo := func(resolved bool) *types.Alert {
var end time.Time
if resolved {
end = now.Add(-time.Second)
} else {
end = now.Add(time.Hour)
}
return &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{"s": "1", "e": "f"},
StartsAt: now.Add(-time.Minute),
EndsAt: end,
},
}
}

type exp struct {
lbls model.LabelSet
muted bool
}
for i, tc := range []struct {
alerts []*types.Alert
expected []exp
}{
{
// alertOne shouldn't be muted since alertTwo hasn't fired.
alerts: []*types.Alert{alertOne()},
expected: []exp{
{
lbls: model.LabelSet{"t": "1", "e": "f"},
muted: false,
},
},
},
{
// alertOne should be muted by alertTwo which is active.
alerts: []*types.Alert{alertOne(), alertTwo(false)},
expected: []exp{
{
lbls: model.LabelSet{"t": "1", "e": "f"},
muted: true,
},
{
lbls: model.LabelSet{"s": "1", "e": "f"},
muted: false,
},
},
},
{
// alertOne shouldn't be muted since alertTwo is resolved.
alerts: []*types.Alert{alertOne(), alertTwo(false), alertTwo(true)},
expected: []exp{
{
lbls: model.LabelSet{"t": "1", "e": "f"},
muted: false,
},
{
lbls: model.LabelSet{"s": "1", "e": "f"},
muted: false,
},
},
},
} {
ap := newFakeAlerts(tc.alerts)
mk := types.NewMarker()
inhibitor := NewInhibitor(ap, []*config.InhibitRule{inhibitRule()}, mk, nopLogger)

go func() {
for ap.finished != nil {
select {
case <-ap.finished:
ap.finished = nil
default:
}
}
inhibitor.Stop()
}()
inhibitor.Run()

for _, expected := range tc.expected {
if inhibitor.Mutes(expected.lbls) != expected.muted {
mute := "unmuted"
if expected.muted {
mute = "muted"
}
t.Errorf("tc: %d, expected alert with labels %q to be %s", i, expected.lbls, mute)
}
}
}
}
72 changes: 72 additions & 0 deletions test/acceptance/inhibit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ import (
func TestInhibiting(t *testing.T) {
t.Parallel()

// This integration test checks that alerts can be inhibited and that an
// inhibited alert will be notified again as soon as the inhibiting alert
// gets resolved.

conf := `
route:
receiver: "default"
Expand Down Expand Up @@ -64,6 +68,10 @@ inhibit_rules:
// second batch of notifications.
am.Push(At(2.2), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa"))

// InstanceDown in zone aa should fire again in the third batch of
// notifications once JobDown in zone aa gets resolved.
am.Push(At(3.6), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(2.2, 3.6))

co.Want(Between(2, 2.5),
Alert("alertname", "test1", "job", "testjob", "zone", "aa").Active(1),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1),
Expand All @@ -76,5 +84,69 @@ inhibit_rules:
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(2.2),
)

co.Want(Between(4, 4.5),
Alert("alertname", "test1", "job", "testjob", "zone", "aa").Active(1),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "ab").Active(1),
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(2.2, 3.6),
)

at.Run()
}

func TestAlwaysInhibiting(t *testing.T) {
t.Parallel()

// This integration test checks that when inhibited and inhibiting alerts
// gets resolved at the same time, the final notification contains both
// alerts.

conf := `
route:
receiver: "default"
group_by: []
group_wait: 1s
group_interval: 1s
repeat_interval: 1s

receivers:
- name: "default"
webhook_configs:
- url: 'http://%s'

inhibit_rules:
- source_match:
alertname: JobDown
target_match:
alertname: InstanceDown
equal:
- job
- zone
`

at := NewAcceptanceTest(t, &AcceptanceOpts{
Tolerance: 150 * time.Millisecond,
})

co := at.Collector("webhook")
wh := NewWebhook(co)

am := at.Alertmanager(fmt.Sprintf(conf, wh.Address()))

am.Push(At(1), Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa"))
am.Push(At(1), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa"))

am.Push(At(2.6), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(1, 2.6))
am.Push(At(2.6), Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1, 2.6))

co.Want(Between(2, 2.5),
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(1),
)

co.Want(Between(3, 3.5),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1, 2.6),
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(1, 2.6),
)

at.Run()
}