Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix resolved alerts still inhibiting #1331

Merged
merged 6 commits into from
Apr 18, 2018
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions inhibit/inhibit.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ package inhibit

import (
"context"
"fmt"
"sync"
"time"

Expand Down Expand Up @@ -82,11 +81,9 @@ func (ih *Inhibitor) run(ctx context.Context) {
continue
}
if a.Resolved() {
// As alerts can also time out without an update, we never
// handle new resolved alerts but invalidate the cache on read.
continue
}
// Populate the inhibition rules' cache.
// Update the inhibition rules' cache.
for _, r := range ih.rules {
if r.SourceMatchers.Match(a.Labels) {
r.set(a)
Expand Down Expand Up @@ -145,7 +142,7 @@ func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
for _, r := range ih.rules {
// Only inhibit if target matchers match but source matchers don't.
if inhibitedByFP, eq := r.hasEqual(lset); !r.SourceMatchers.Match(lset) && r.TargetMatchers.Match(lset) && eq {
ih.marker.SetInhibited(fp, fmt.Sprintf("%d", inhibitedByFP))
ih.marker.SetInhibited(fp, inhibitedByFP.String())
return true
}
}
Expand Down
161 changes: 159 additions & 2 deletions inhibit/inhibit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,20 @@ import (
"testing"
"time"

"github.com/go-kit/kit/log"
"github.com/kylelemons/godebug/pretty"
"github.com/prometheus/common/model"

"github.com/prometheus/alertmanager/config"
"github.com/prometheus/alertmanager/provider"
"github.com/prometheus/alertmanager/types"
"github.com/prometheus/common/model"
)

var nopLogger = log.NewNopLogger()

func TestInhibitRuleHasEqual(t *testing.T) {
t.Parallel()

now := time.Now()
cases := []struct {
initial map[model.Fingerprint]*types.Alert
Expand Down Expand Up @@ -135,14 +142,16 @@ func TestInhibitRuleHasEqual(t *testing.T) {
}

func TestInhibitRuleMatches(t *testing.T) {
t.Parallel()

// Simple inhibit rule.
cr := config.InhibitRule{
SourceMatch: map[string]string{"s": "1"},
TargetMatch: map[string]string{"t": "1"},
Equal: model.LabelNames{"e"},
}
m := types.NewMarker()
ih := NewInhibitor(nil, []*config.InhibitRule{&cr}, m, nil)
ih := NewInhibitor(nil, []*config.InhibitRule{&cr}, m, nopLogger)
ir := ih.rules[0]
now := time.Now()
// Active alert that matches the source filter
Expand Down Expand Up @@ -226,3 +235,151 @@ func TestInhibitRuleGC(t *testing.T) {
t.Errorf(pretty.Compare(r.scache, after))
}
}

// fakeAlerts is a test double that is handed to NewInhibitor in place of a
// real alert provider. It replays a fixed list of alerts to its subscriber.
type fakeAlerts struct {
// alerts are sent, in order, over the iterator returned by Subscribe.
alerts []*types.Alert
// finished is closed by the goroutine started in Subscribe once every
// entry of alerts plus one trailing sentinel alert has been received.
finished chan struct{}
}

// newFakeAlerts builds a fakeAlerts that will replay the given alerts and
// allocates the channel used to signal that the replay is complete.
func newFakeAlerts(alerts []*types.Alert) *fakeAlerts {
	fake := new(fakeAlerts)
	fake.alerts = alerts
	fake.finished = make(chan struct{})
	return fake
}

// GetPending is a stub that always returns a nil iterator.
// NOTE(review): presumably present only to satisfy the provider interface
// expected by NewInhibitor — confirm against the provider package.
func (f *fakeAlerts) GetPending() provider.AlertIterator {
	return nil
}
// Get is a stub: it ignores the fingerprint and returns a nil alert together
// with a nil error.
func (f *fakeAlerts) Get(_ model.Fingerprint) (*types.Alert, error) {
	return nil, nil
}
// Put is a stub: it discards the given alerts and always reports success.
func (f *fakeAlerts) Put(_ ...*types.Alert) error {
	return nil
}
// Subscribe returns an iterator that yields every alert in f.alerts, in
// order, followed by one extra alert with an empty label set. Once that
// trailing alert has been received, f.finished is closed. The feeding
// goroutine then waits on the iterator's done channel before exiting.
func (f *fakeAlerts) Subscribe() provider.AlertIterator {
	var (
		alertc = make(chan *types.Alert)
		donec  = make(chan struct{})
	)

	feed := func() {
		for _, pending := range f.alerts {
			alertc <- pending
		}

		// The channel is unbuffered, so this extra (meaningless) alert can
		// only be handed over after the consumer has come back for another
		// receive, i.e. after it has taken everything sent before it.
		sentinel := &types.Alert{
			Alert: model.Alert{
				Labels:   model.LabelSet{},
				StartsAt: time.Now(),
			},
		}
		alertc <- sentinel

		close(f.finished)
		<-donec
	}
	go feed()

	return provider.NewAlertIterator(alertc, donec, nil)
}

// TestInhibit runs an Inhibitor against a fake alert provider for several
// alert sequences and checks, via Mutes, which label sets end up inhibited.
//
// Each case builds a fresh provider, marker and inhibitor. The inhibitor is
// stopped as soon as the provider signals that every alert has been consumed,
// after which the expectations are evaluated.
func TestInhibit(t *testing.T) {
	t.Parallel()

	now := time.Now()
	inhibitRule := func() *config.InhibitRule {
		return &config.InhibitRule{
			SourceMatch: map[string]string{"s": "1"},
			TargetMatch: map[string]string{"t": "1"},
			Equal:       model.LabelNames{"e"},
		}
	}
	// alertOne is muted by alertTwo when it is active.
	alertOne := func() *types.Alert {
		return &types.Alert{
			Alert: model.Alert{
				Labels:   model.LabelSet{"t": "1", "e": "f"},
				StartsAt: now.Add(-time.Minute),
				EndsAt:   now.Add(time.Hour),
			},
		}
	}
	// alertTwo matches the rule's source matchers. When resolved is true its
	// end time lies one second in the past, otherwise one hour in the future.
	alertTwo := func(resolved bool) *types.Alert {
		end := now.Add(time.Hour)
		if resolved {
			end = now.Add(-time.Second)
		}
		return &types.Alert{
			Alert: model.Alert{
				Labels:   model.LabelSet{"s": "1", "e": "f"},
				StartsAt: now.Add(-time.Minute),
				EndsAt:   end,
			},
		}
	}

	type exp struct {
		lbls  model.LabelSet
		muted bool
	}
	for i, tc := range []struct {
		alerts   []*types.Alert
		expected []exp
	}{
		{
			// alertOne shouldn't be muted since alertTwo hasn't fired.
			alerts: []*types.Alert{alertOne()},
			expected: []exp{
				{
					lbls:  model.LabelSet{"t": "1", "e": "f"},
					muted: false,
				},
			},
		},
		{
			// alertOne should be muted by alertTwo which is active.
			alerts: []*types.Alert{alertOne(), alertTwo(false)},
			expected: []exp{
				{
					lbls:  model.LabelSet{"t": "1", "e": "f"},
					muted: true,
				},
				{
					lbls:  model.LabelSet{"s": "1", "e": "f"},
					muted: false,
				},
			},
		},
		{
			// alertOne shouldn't be muted since alertTwo is resolved.
			alerts: []*types.Alert{alertOne(), alertTwo(false), alertTwo(true)},
			expected: []exp{
				{
					lbls:  model.LabelSet{"t": "1", "e": "f"},
					muted: false,
				},
				{
					lbls:  model.LabelSet{"s": "1", "e": "f"},
					muted: false,
				},
			},
		},
	} {
		ap := newFakeAlerts(tc.alerts)
		mk := types.NewMarker()
		inhibitor := NewInhibitor(ap, []*config.InhibitRule{inhibitRule()}, mk, nopLogger)

		go func() {
			// Block until the fake provider closes its finished channel,
			// then stop the inhibitor so that Run returns. A blocking
			// receive avoids spinning on a select/default poll and leaves
			// ap.finished untouched.
			<-ap.finished
			inhibitor.Stop()
		}()
		inhibitor.Run()

		for _, expected := range tc.expected {
			if inhibitor.Mutes(expected.lbls) != expected.muted {
				mute := "unmuted"
				if expected.muted {
					mute = "muted"
				}
				t.Errorf("tc: %d, expected alert with labels %q to be %s", i, expected.lbls, mute)
			}
		}
	}
}
6 changes: 6 additions & 0 deletions provider/mem/mem.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,12 @@ func (a *Alerts) Put(alerts ...*types.Alert) error {
if (alert.EndsAt.After(old.StartsAt) && alert.EndsAt.Before(old.EndsAt)) ||
(alert.StartsAt.After(old.StartsAt) && alert.StartsAt.Before(old.EndsAt)) {
alert = old.Merge(alert)
// Merge returns a new alert. In order to
// update old, we have to set the struct it
// points to to equal the newly merged alert.
// This is necessary as old may be stored in
// the inhibitor's rules cache.
*old = *alert
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As mentioned in the description, this line could be removed if the inhibitor's internal scache is updated even if an alert is resolved.

I think I would prefer that, as opposed to updating a pointer's value and hoping that it's being referred to elsewhere.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Had a chat with @grobie and he agreed. The latest commit reflects these changes.

}
}

Expand Down
72 changes: 72 additions & 0 deletions test/acceptance/inhibit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ import (
func TestInhibiting(t *testing.T) {
t.Parallel()

// This integration test checks that alerts can be inhibited and that an
// inhibited alert will be notified again as soon as the inhibiting alert
// gets resolved.

conf := `
route:
receiver: "default"
Expand Down Expand Up @@ -64,6 +68,10 @@ inhibit_rules:
// second batch of notifications.
am.Push(At(2.2), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa"))

// InstanceDown in zone aa should fire again in the third batch of
// notifications once JobDown in zone aa gets resolved.
am.Push(At(3.6), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(2.2, 3.6))

co.Want(Between(2, 2.5),
Alert("alertname", "test1", "job", "testjob", "zone", "aa").Active(1),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1),
Expand All @@ -76,5 +84,69 @@ inhibit_rules:
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(2.2),
)

co.Want(Between(4, 4.5),
Alert("alertname", "test1", "job", "testjob", "zone", "aa").Active(1),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "ab").Active(1),
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(2.2, 3.6),
)

at.Run()
}

// TestAlwaysInhibiting is an acceptance test for the case where an inhibited
// alert and the alert inhibiting it are resolved at the same time.
func TestAlwaysInhibiting(t *testing.T) {
t.Parallel()

// This integration test checks that when the inhibited and the inhibiting
// alerts get resolved at the same time, the final notification contains
// both alerts.

// The configuration declares a single inhibit rule: JobDown is the source,
// InstanceDown is the target, and both must agree on the job and zone
// labels. The %s in the webhook URL is filled in with wh.Address() below.
conf := `
route:
receiver: "default"
group_by: []
group_wait: 1s
group_interval: 1s
repeat_interval: 1s

receivers:
- name: "default"
webhook_configs:
- url: 'http://%s'

inhibit_rules:
- source_match:
alertname: JobDown
target_match:
alertname: InstanceDown
equal:
- job
- zone
`

at := NewAcceptanceTest(t, &AcceptanceOpts{
Tolerance: 150 * time.Millisecond,
})

co := at.Collector("webhook")
wh := NewWebhook(co)

am := at.Alertmanager(fmt.Sprintf(conf, wh.Address()))

// Both alerts are pushed at the same time mark and share the job and zone
// labels, so the inhibit rule above applies to this pair.
am.Push(At(1), Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa"))
am.Push(At(1), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa"))

// Both alerts are pushed again with an end bound of 2.6.
// NOTE(review): presumably Active(1, 2.6) marks the alert as resolved at
// 2.6 — confirm against the acceptance test framework.
am.Push(At(2.6), Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(1, 2.6))
am.Push(At(2.6), Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1, 2.6))

// Only JobDown is expected in the first window; InstanceDown is absent
// because it is inhibited.
co.Want(Between(2, 2.5),
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(1),
)

// In the second window both alerts are expected, each carrying the 2.6
// end bound.
co.Want(Between(3, 3.5),
Alert("alertname", "InstanceDown", "job", "testjob", "zone", "aa").Active(1, 2.6),
Alert("alertname", "JobDown", "job", "testjob", "zone", "aa").Active(1, 2.6),
)

at.Run()
}