Skip to content

Commit

Permalink
[Heartbeat] Fix incorrect 'Up' status for all mode (#11895) (#12007)
Browse files Browse the repository at this point in the history
This fixes #11737, where setting mode: all now spawns multiple sub-tasks, but the parent task still runs as a job even though it tests very little and will always succeed so long as the DNS query does. It is essentially a parent job testing that the DNS resolver works.

From a certain strict POV the behavior before this patch is correct. We executed a discrete job (checking DNS) and it worked successfully. A second part of the job, actually hitting the endpoints, failed. The total job (as noted in the summary) did fail, but a sub-part did succeed. That said, this is too complex and a bad UX, so this patch makes sense. Maybe in the future we will be under different constraints.

The fix here involved marking that job's emitted event as cancelled using job metadata. I'd discussed using event.private for that with @ruflin, but looking at the code metadata seemed more appropriate. I'd love to hear your $0.02 @ruflin if you think private is more appropriate. Metricbeat wraps all events in its own metadata struct, but I don't think we're there yet in heartbeat in terms of a major refactor being justified.

Testing this manually is easy, just point at a domain with multiple A records, like elastic.co. Truly integration testing it programmatically is hard without setting up a DNS resolver. The compromise I reached here is unit tests for the individual bits of logic.

Fixes #11737

(cherry picked from commit 078612e)
  • Loading branch information
andrewvc authored May 2, 2019
1 parent b0d3a07 commit ca315ef
Show file tree
Hide file tree
Showing 6 changed files with 267 additions and 50 deletions.
26 changes: 24 additions & 2 deletions heartbeat/eventext/eventext.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,30 @@ import (
// MergeEventFields merges the given common.MapStr into the given Event's Fields.
//
// Only a clone of merge is ever handed to the event: it is deep-updated into
// the existing Fields, or becomes the Fields when the event has none yet. The
// caller's map itself is never stored on the event, so the same merge value can
// be applied to several events.
func MergeEventFields(e *beat.Event, merge common.MapStr) {
	fields := merge.Clone()
	if e.Fields == nil {
		e.Fields = fields
		return
	}
	e.Fields.DeepUpdate(fields)
}

// EventCancelledMetaKey is the key in an event's Meta map under which
// CancelEvent stores its marker and which IsEventCancelled looks up to decide
// whether the event has been cancelled.
const EventCancelledMetaKey = "__hb_evt_cancel__"

// CancelEvent flags the event as cancelled by storing true under
// EventCancelledMetaKey in its Meta map, creating the map first when it is nil.
// Downstream consumers of the event should neither emit nor output it.
// A nil event is left alone.
func CancelEvent(event *beat.Event) {
	if event == nil {
		return
	}
	if event.Meta == nil {
		event.Meta = common.MapStr{}
	}
	event.Meta.Put(EventCancelledMetaKey, true)
}

// IsEventCancelled reports whether the event carries the marker left by
// CancelEvent. It returns false for a nil event, a nil Meta map, a failed
// lookup, or any stored value other than the boolean true.
func IsEventCancelled(event *beat.Event) bool {
	if event != nil && event.Meta != nil {
		if marker, err := event.Meta.GetValue(EventCancelledMetaKey); err == nil {
			// Only a genuine bool true counts as the cancellation marker.
			cancelled, isBool := marker.(bool)
			return isBool && cancelled
		}
	}
	return false
}
80 changes: 44 additions & 36 deletions heartbeat/monitors/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

"github.com/pkg/errors"

"github.com/elastic/beats/heartbeat/eventext"
"github.com/elastic/beats/heartbeat/monitors/jobs"
"github.com/elastic/beats/heartbeat/scheduler"
"github.com/elastic/beats/heartbeat/scheduler/schedule"
Expand Down Expand Up @@ -86,42 +87,7 @@ func (e ProcessorsError) Error() string {

// prepareSchedulerJob adapts a heartbeat job into a scheduler.TaskFunc.
//
// All of the work (running the job, publishing its event through this
// configuredJob's client unless the event was cancelled, and wrapping any
// continuation jobs) is delegated to runPublishJob, so the publish rules live
// in exactly one place.
func (t *configuredJob) prepareSchedulerJob(job jobs.Job) scheduler.TaskFunc {
	return func() []scheduler.TaskFunc {
		return runPublishJob(job, t.client)
	}
}

Expand Down Expand Up @@ -166,3 +132,45 @@ func (t *configuredJob) Stop() {
t.client.Close()
}
}

// runPublishJob runs job against a freshly created event, publishes that event
// through client unless the job cancelled it, and returns one
// scheduler.TaskFunc per continuation job the job handed back (nil when there
// are none).
//
// An error returned by the job is only logged; the event and the continuations
// are processed regardless.
func runPublishJob(job jobs.Job, client beat.Client) []scheduler.TaskFunc {
	event := &beat.Event{
		Fields: common.MapStr{},
	}

	next, err := job(event)
	if err != nil {
		logp.Err("Job failed with: %v", err)
	}

	hasContinuations := len(next) > 0

	if event.Fields != nil && !eventext.IsEventCancelled(event) {
		// If continuations are present we defensively publish a clone of the event
		// in the chance that the event shares underlying data with the events for continuations
		// This prevents races where the pipeline publish could accidentally alter multiple events.
		if hasContinuations {
			clone := beat.Event{
				Timestamp: event.Timestamp,
				Meta:      event.Meta.Clone(),
				Fields:    event.Fields.Clone(),
			}
			client.Publish(clone)
		} else {
			// no clone needed if no continuations
			client.Publish(*event)
		}
	}

	if !hasContinuations {
		return nil
	}

	continuations := make([]scheduler.TaskFunc, len(next))
	for i, n := range next {
		// Pin the job for this iteration. Before Go 1.22 the range variable is
		// shared by all iterations, so without this copy every continuation
		// would end up running the last job in next.
		n := n
		continuations[i] = func() []scheduler.TaskFunc {
			return runPublishJob(n, client)
		}
	}
	return continuations
}
96 changes: 96 additions & 0 deletions heartbeat/monitors/task_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

package monitors

import (
"testing"

"github.com/stretchr/testify/require"

"github.com/elastic/beats/heartbeat/eventext"
"github.com/elastic/beats/heartbeat/monitors/jobs"
"github.com/elastic/beats/libbeat/beat"
"github.com/elastic/beats/libbeat/common"
"github.com/elastic/beats/libbeat/common/mapval"
)

// Test_runPublishJob runs a job and all continuations it produces through
// runPublishJob against a mock client, then checks that the events published
// match the expected validators one-to-one and in order.
func Test_runPublishJob(t *testing.T) {
	// simpleJob adds foo=bar to the event and schedules nothing further.
	simpleJob := func(event *beat.Event) (j []jobs.Job, e error) {
		eventext.MergeEventFields(event, common.MapStr{"foo": "bar"})
		return nil, nil
	}

	// fooBar builds a fresh validator matching the fields simpleJob writes.
	fooBar := func() mapval.Validator {
		return mapval.MustCompile(mapval.Map{"foo": "bar"})
	}

	testCases := []struct {
		name       string
		job        jobs.Job
		validators []mapval.Validator
	}{
		{
			name:       "simple",
			job:        simpleJob,
			validators: []mapval.Validator{fooBar()},
		},
		{
			name: "one cont",
			job: func(event *beat.Event) (j []jobs.Job, e error) {
				simpleJob(event)
				return []jobs.Job{simpleJob}, nil
			},
			validators: []mapval.Validator{fooBar(), fooBar()},
		},
		{
			// The parent cancels its own event, so only the continuation publishes.
			name: "cancelled cont",
			job: func(event *beat.Event) (j []jobs.Job, e error) {
				eventext.CancelEvent(event)
				return []jobs.Job{simpleJob}, nil
			},
			validators: []mapval.Validator{fooBar()},
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			client := &MockBeatClient{}

			// Drain the task queue breadth-first, enqueueing whatever each task returns.
			pending := runPublishJob(tc.job, client)
			for len(pending) > 0 {
				task := pending[0]
				pending = pending[1:]
				pending = append(pending, task()...)
			}
			client.Close()

			require.Len(t, client.publishes, len(tc.validators))
			for i := range client.publishes {
				mapval.Test(t, tc.validators[i], client.publishes[i].Fields)
			}
		})
	}
}
7 changes: 6 additions & 1 deletion heartbeat/monitors/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"net"
"time"

"github.com/elastic/beats/heartbeat/eventext"
"github.com/elastic/beats/heartbeat/look"
"github.com/elastic/beats/heartbeat/monitors/jobs"
"github.com/elastic/beats/heartbeat/monitors/wrappers"
Expand Down Expand Up @@ -211,7 +212,11 @@ func makeByHostAllIPJob(
ipFields := resolveIPEvent(ip.String(), resolveRTT)
cont[i] = wrappers.WithFields(ipFields, pingFactory(addr))
}
return cont, nil
// Ideally we would test this invocation. This function, however, is really hard to test given all the extra context it takes in.
// In a future refactor we could perhaps test that this is correctly invoked.
eventext.CancelEvent(event)

return cont, err
}
}

Expand Down
17 changes: 11 additions & 6 deletions heartbeat/monitors/wrappers/monitors.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,18 @@ func makeAddSummary() jobs.JobWrapper {
state.mtx.Lock()
defer state.mtx.Unlock()

// After each job
eventStatus, _ := event.GetValue("monitor.status")
if eventStatus == "up" {
state.up++
} else {
state.down++
// If the event is cancelled we don't record it as being either up or down since
// we discard the event anyway.
if !eventext.IsEventCancelled(event) {
// After each job
eventStatus, _ := event.GetValue("monitor.status")
if eventStatus == "up" {
state.up++
} else {
state.down++
}
}

// No error check needed here
event.PutValue("monitor.check_group", state.checkGroup)

Expand Down
Loading

0 comments on commit ca315ef

Please sign in to comment.