Skip to content

Commit

Permalink
[Heartbeat] Monitor Retries (#36147)
Browse files Browse the repository at this point in the history
Adds retries to Heartbeat monitors. Part of elastic/synthetics#792

This refactors a ton of code around summarizing events, and cleans up a lot of tech debt as well.
  • Loading branch information
andrewvc authored Aug 31, 2023
1 parent ac5f806 commit a6bae85
Show file tree
Hide file tree
Showing 69 changed files with 1,098 additions and 562 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
// Use 'postCreateCommand' to run commands after the container is created.
// Mage is installed this way, and not via the feature plugin because that plugin was
// broken for me, and mage install is simple enough
"postCreateCommand": "cd /opt/; sudo mkdir mage; sudo chown $USER:$(id -g) mage; git clone --depth=1 https://github.com/magefile/mage && cd mage && go run bootstrap.go"
"postCreateCommand": "cd /opt/; sudo mkdir mage; sudo chown $USER:$(id -g) mage; git clone --depth=1 https://github.com/magefile/mage && cd mage && go run bootstrap.go; npm i -g @elastic/synthetics; sudo env \"PATH=$PATH\" npx -yes playwright install-deps"

// Configure tool-specific properties.
// "customizations": {},
Expand Down
3 changes: 3 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
run:
# timeout for analysis, e.g. 30s, 5m, default is 1m
timeout: 15m
build-tags:
- synthetics
- integration

issues:
# Maximum count of issues with the same text.
Expand Down
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def withTools(Map args = [:], Closure body) {
body()
}
} else if (args.get('nodejs', false)) {
withNodeJSEnv() {
withNodeJSEnv(version: '18.17.1') {
withEnv(["ELASTIC_SYNTHETICS_CAPABLE=true"]) {
cmd(label: "Install @elastic/synthetics", script: "npm i -g @elastic/synthetics")
body()
Expand Down
20 changes: 20 additions & 0 deletions dev-tools/mage/gotest.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,25 @@ func DefaultGoTestUnitArgs() GoTestArgs { return makeGoTestArgs("Unit") }
func DefaultGoTestIntegrationArgs() GoTestArgs {
args := makeGoTestArgs("Integration")
args.Tags = append(args.Tags, "integration")

synth := exec.Command("npx", "@elastic/synthetics", "-h")
if synth.Run() == nil {
// Run an empty journey to ensure playwright can be loaded
// catches situations like missing playwright deps
cmd := exec.Command("sh", "-c", "echo 'step(\"t\", () => { })' | elastic-synthetics --inline")
var out strings.Builder
cmd.Stdout = &out
cmd.Stderr = &out
err := cmd.Run()
if err != nil || cmd.ProcessState.ExitCode() != 0 {
fmt.Printf("synthetics is available, but not invokable, command exited with bad code: %s\n", out.String())
}

fmt.Println("npx @elastic/synthetics found, will run with synthetics tags")
os.Setenv("ELASTIC_SYNTHETICS_CAPABLE", "true")
args.Tags = append(args.Tags, "synthetics")
}

// Use the non-cachable -count=1 flag to disable test caching when running integration tests.
// There are reasons to re-run tests even if the code is unchanged (e.g. Dockerfile changes).
args.ExtraFlags = append(args.ExtraFlags, "-count=1")
Expand All @@ -125,6 +144,7 @@ func DefaultGoTestIntegrationFromHostArgs() GoTestArgs {
// module integration tests. We tag integration test files with 'integration'.
func GoTestIntegrationArgsForModule(module string) GoTestArgs {
	moduleArgs := makeGoTestArgsForModule("Integration", module)
	// Integration test files are selected through the 'integration' build tag.
	moduleArgs.Tags = append(moduleArgs.Tags, "integration")
	return moduleArgs
}
Expand Down
11 changes: 10 additions & 1 deletion dev-tools/mage/target/unittest/unittest.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package unittest
import (
"context"
"fmt"
"os/exec"

"github.com/magefile/mage/mg"

Expand Down Expand Up @@ -55,7 +56,15 @@ func UnitTest() {
// Use RACE_DETECTOR=true to enable the race detector.
func GoUnitTest(ctx context.Context) error {
	mg.SerialCtxDeps(ctx, goTestDeps...)

	utArgs := devtools.DefaultGoTestUnitArgs()
	// Probe for the synthetics CLI: if `npx @elastic/synthetics -h` runs
	// without error, add the "synthetics" build tag so the synthetics unit
	// tests are included in this run.
	synth := exec.Command("npx", "@elastic/synthetics", "-h")
	if synth.Run() == nil {
		// Println terminates the notice with a newline so it does not run
		// into whatever is printed next.
		fmt.Println("npx @elastic/synthetics found, will run with synthetics tags")
		utArgs.Tags = append(utArgs.Tags, "synthetics")
	}

	// Single exit point: the tests are launched only after the tag list has
	// been finalized above.
	return devtools.GoTest(ctx, utArgs)
}

// PythonUnitTest executes the python system tests.
Expand Down
20 changes: 20 additions & 0 deletions heartbeat/_meta/fields.common.yml
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,26 @@
type: integer
description: >
The number of endpoints that failed
- name: status
type: keyword
description: >
The status of this check as a whole. Either up or down.
- name: attempt
type: short
description: >
When performing a check this number is 1 for the first check, and increments in the event of a retry.
- name: max_attempts
type: short
description: >
The maximum number of checks that may be performed. Note, the actual number may be smaller.
- name: final_attempt
type: boolean
description: >
True if no further checks will be performed in this retry group.
- name: retry_group
type: keyword
description: >
A unique token used to group checks across attempts.
- key: service
title: "APM Service"
description:
Expand Down
2 changes: 1 addition & 1 deletion heartbeat/autodiscover/builder/hints/monitors.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ func (hb *heartbeatHints) getHostsWithPort(hints mapstr.M, port int, podEvent bo
return nil, fmt.Errorf("no hosts selected for port %d with hints: %+v", port, thosts)
}

var result []string
result := make([]string, 0, len(hostSet))
for host := range hostSet {
result = append(result, host)
}
Expand Down
50 changes: 50 additions & 0 deletions heartbeat/docs/fields.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -16427,6 +16427,56 @@ type: integer
--
*`summary.status`*::
+
--
The status of this check as a whole. Either up or down.
type: keyword
--
*`summary.attempt`*::
+
--
When performing a check this number is 1 for the first check, and increments in the event of a retry.
type: short
--
*`summary.max_attempts`*::
+
--
The maximum number of checks that may be performed. Note, the actual number may be smaller.
type: short
--
*`summary.final_attempt`*::
+
--
True if no further checks will be performed in this retry group.
type: boolean
--
*`summary.retry_group`*::
+
--
A unique token used to group checks across attempts.
type: keyword
--
[[exported-fields-synthetics]]
== Synthetics types fields
Expand Down
29 changes: 12 additions & 17 deletions heartbeat/hbtest/hbtestutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (

"github.com/elastic/beats/v7/heartbeat/ecserr"
"github.com/elastic/beats/v7/heartbeat/monitors/active/dialchain/tlsmeta"
"github.com/elastic/beats/v7/heartbeat/monitors/wrappers/summarizer/summarizertesthelper"

"github.com/elastic/beats/v7/heartbeat/hbtestllext"

Expand Down Expand Up @@ -142,17 +143,13 @@ func TLSChecks(chainIndex, certIndex int, certificate *x509.Certificate) validat
PeerCertificates: []*x509.Certificate{certificate},
}, time.Duration(1))

//nolint:errcheck // There are no new changes to this line but
// linter has been activated in the meantime. We'll cleanup separately.
expected.Put("tls.rtt.handshake.us", hbtestllext.IsInt64)
_, _ = expected.Put("tls.rtt.handshake.us", hbtestllext.IsInt64)

// Generally, the exact cipher will match, but on windows 7 32bit this is not true!
// We don't actually care about the exact cipher matching, since we're not testing the TLS
// implementation, we trust go there, just that most of the metadata is present
if runtime.GOOS == "windows" && bits.UintSize == 32 {
//nolint:errcheck // There are no new changes to this line but
// linter has been activated in the meantime. We'll cleanup separately.
expected.Put("tls.cipher", isdef.IsString)
_, _ = expected.Put("tls.cipher", isdef.IsString)
}

return lookslike.MustCompile(expected)
Expand Down Expand Up @@ -190,15 +187,14 @@ func BaseChecks(ip string, status string, typ string) validator.Validator {
)
}

// SummaryChecks validates the "summary" + "state" fields
func SummaryChecks(up int, down int) validator.Validator {
return lookslike.MustCompile(map[string]interface{}{
"summary": map[string]interface{}{
"up": uint16(up),
"down": uint16(down),
},
"state": hbtestllext.IsMonitorState,
})
// SummaryStateChecks composes two validators: the summary validator produced
// by summarizertesthelper.SummaryValidator for the given up/down counts, and
// a check that the "state" field satisfies hbtestllext.IsMonitorState.
func SummaryStateChecks(up uint16, down uint16) validator.Validator {
	summaryValidator := summarizertesthelper.SummaryValidator(up, down)
	stateValidator := lookslike.MustCompile(map[string]interface{}{
		"state": hbtestllext.IsMonitorState,
	})
	return lookslike.Compose(summaryValidator, stateValidator)
}

// ResolveChecks returns a lookslike matcher for the 'resolve' fields.
Expand Down Expand Up @@ -289,8 +285,7 @@ func StartHTTPSServer(t *testing.T, tlsCert tls.Certificate) (host string, port
require.NoError(t, err)

// No need to start a real server, since this is invalid, we just
//nolint:gosec // There are no new changes to this line but
// linter has been activated in the meantime. We'll cleanup separately.
//nolint:gosec // it's a test, sec issues don't apply
l, err := tls.Listen("tcp", "127.0.0.1:0", &tls.Config{
Certificates: []tls.Certificate{tlsCert},
})
Expand Down
2 changes: 1 addition & 1 deletion heartbeat/include/fields.go

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion heartbeat/monitors/active/dialchain/dialchain.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ func (c *DialerChain) Clone() *DialerChain {
func (c *DialerChain) Build(event *beat.Event) (d transport.Dialer, err error) {
d, err = c.Net.build(event)
if err != nil {
return
return d, err
}

for _, layer := range c.Layers {
Expand Down
Loading

0 comments on commit a6bae85

Please sign in to comment.