-
Notifications
You must be signed in to change notification settings - Fork 4.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Heartbeat] Defer monitor / ICMP errors to monitor runtime / ES #29413
Changes from 11 commits
b49e341
6645da7
19e3834
3545c47
1907a1e
a09dd3d
7481feb
c5e48e6
11b01d1
9c03f7e
3be2013
e9db6b3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,6 @@ package icmp | |
import ( | ||
"bytes" | ||
"encoding/binary" | ||
"errors" | ||
"fmt" | ||
"math/rand" | ||
"net" | ||
|
@@ -159,29 +158,6 @@ func newICMPLoop() (*stdICMPLoop, error) { | |
return l, nil | ||
} | ||
|
||
func (l *stdICMPLoop) checkNetworkMode(mode string) error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moved to main loop icmp recv so we don't interfere with heartbeat initialization. |
||
ip4, ip6 := false, false | ||
switch mode { | ||
case "ip4": | ||
ip4 = true | ||
case "ip6": | ||
ip6 = true | ||
case "ip": | ||
ip4, ip6 = true, true | ||
default: | ||
return fmt.Errorf("'%v' is not supported", mode) | ||
} | ||
|
||
if ip4 && l.conn4 == nil { | ||
return errors.New("failed to initiate IPv4 support. Check log details for permission configuration") | ||
} | ||
if ip6 && l.conn6 == nil { | ||
return errors.New("failed to initiate IPv6 support. Check log details for permission configuration") | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func (l *stdICMPLoop) runICMPRecv(conn *icmp.PacketConn, proto int) { | ||
for { | ||
bytes := make([]byte, 512) | ||
|
@@ -251,6 +227,14 @@ func (l *stdICMPLoop) ping( | |
timeout time.Duration, | ||
interval time.Duration, | ||
) (time.Duration, int, error) { | ||
isIPv6 := addr.IP.To4() == nil | ||
if isIPv6 && l.conn6 == nil { | ||
return -1, -1, fmt.Errorf("cannot ping IPv6 address '%s', no IPv6 connection available", addr) | ||
} | ||
if !isIPv6 && l.conn4 == nil { | ||
return -1, -1, fmt.Errorf("cannot ping IPv4 address '%s', no IPv4 connection available", addr) | ||
} | ||
|
||
var err error | ||
toTimer := time.NewTimer(timeout) | ||
defer toTimer.Stop() | ||
|
@@ -379,7 +363,7 @@ func (l *stdICMPLoop) sendEchoRequest(addr *net.IPAddr) (*requestContext, error) | |
|
||
_, err := conn.WriteTo(encoded, addr) | ||
if err != nil { | ||
return nil, err | ||
return nil, fmt.Errorf("could not write to conn: %w", err) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. more specific error context is always nice |
||
} | ||
|
||
ctx.ts = ts | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -163,13 +163,26 @@ func newMonitorUnsafe( | |
return p.Close() | ||
} | ||
|
||
wrappedJobs := wrappers.WrapCommon(p.Jobs, m.stdFields) | ||
m.endpoints = p.Endpoints | ||
|
||
// If we've hit an error at this point, still run on schedule, but always return an error. | ||
// This way the error is clearly communicated through to kibana. | ||
// Since the error is not recoverable in these instances, the user will need to reconfigure | ||
// the monitor, which will destroy and recreate it in heartbeat, thus clearing this error. | ||
// | ||
// Note: we do this at this point, and no earlier, because at a minimum we need the | ||
// standard monitor fields (id, name and schedule) to deliver an error to kibana in a way | ||
// that it can render. | ||
if err != nil { | ||
return m, fmt.Errorf("job err %v", err) | ||
// Note, needed to hoist err to this scope, not just to add a prefix | ||
fullErr := fmt.Errorf("job could not be initialized: %s", err) | ||
// A placeholder job that always returns an error | ||
p.Jobs = []jobs.Job{func(event *beat.Event) ([]jobs.Job, error) { | ||
return nil, fullErr | ||
}} | ||
Comment on lines
+175
to
+180
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is great! Thanks for the helpful comment on the top too. This monitor initialisation part is really elegant, but having the comment on the top does help a lot given the further wrapping and how the jobs then get queued. |
||
} | ||
|
||
wrappedJobs := wrappers.WrapCommon(p.Jobs, m.stdFields) | ||
m.endpoints = p.Endpoints | ||
|
||
m.configuredJobs, err = m.makeTasks(config, wrappedJobs) | ||
if err != nil { | ||
return m, err | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
import sys | ||
import time | ||
import unittest | ||
import re | ||
from beat.beat import INTEGRATION_TESTS | ||
from elasticsearch import Elasticsearch | ||
from heartbeat import BaseTest | ||
|
@@ -44,8 +45,12 @@ def has_failed_message(): return self.log_contains("Failed to initialize ICMP lo | |
# we should run pings on those machines and make sure they work. | ||
self.wait_until(lambda: has_started_message() or has_failed_message(), 30) | ||
|
||
self.wait_until(lambda: self.output_has(lines=1)) | ||
output = self.read_output() | ||
monitor_status = output[0]["monitor.status"] | ||
monitor_error = output[0]["error.message"] | ||
if has_failed_message(): | ||
proc.check_kill_and_wait(1) | ||
assert monitor_status == "down" | ||
self.assertRegex(monitor_error, ".*Insufficient privileges to perform ICMP ping.*") | ||
else: | ||
# Check that documents are moving through | ||
self.wait_until(lambda: self.output_has(lines=1)) | ||
assert monitor_status == "up" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've seen we have the following comment in the code to explain why we have these different assertions, which makes this test really test different things depending on the underlying environment:
However, I wonder whether we should just add a further verification above to check whether we can perform ICMP pings and then we can use that to decide which logs and statuses we'll look for. Right now we use Heartbeat's logs to assert on its behaviour, while the suggestion above would help us use Python to look into the underlying platform and then assert on Heartbeat's behaviour. In other words, even though it would be a conditional test, its conditions would not depend only on Heartbeat. This is not a blocking comment btw, and you can consider this to be outside the scope of this PR if you judge it to be unnecessary for now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I think it's a good point for a follow-up PR perhaps. The only to determine if you can do a ping is to actually do one sadly or maybe to use The tricky thing from a CI perspective is you need to see if you have permission, somehow, to run We probably need this logic regardless because on most people's laptops the no pings behavior will be the default. I'm not averse to us reaching out to robots to fix this, but I don't think it's a priority given the ultra-low churn WRT the ICMP code. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
.synthetics | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🙏 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now checked in stdloop.go, we don't want to check at plugin initialization, just at exec time