Skip to content

Commit

Permalink
Feature: add retries to ICMP prober (#896)
Browse files Browse the repository at this point in the history
Add a retry mechanism to the ICMP prober. If the packet count is not
set, use 3 packets instead of 1, and require that at least 1 of them
gets a response. This should cover most reasonable scenarios where
retries are needed.

Note that this will send out 3 packets regardless of whether 1 is
already received, so this has an impact on the probe_duration_seconds
metric (because the probe takes longer to run).

If the time for an individual packet is important, use
`sum(probe_icmp_duration_seconds)` instead. That metric is split by
phases, and the `rtt` phase is the average round-trip-time for all the
received packets. The other two phases, `setup` and `resolve` only run
once.

The resolve part of this also retries, in the event that the DNS server
fails to produce a response and there's an indication that the request
should be retried (e.g. if the response is "not found", that is not
retried).

Signed-off-by: Marcelo E. Magallon <[email protected]>
  • Loading branch information
mem authored Sep 19, 2024
1 parent 8389e94 commit 49cd928
Show file tree
Hide file tree
Showing 4 changed files with 184 additions and 80 deletions.
21 changes: 14 additions & 7 deletions internal/prober/icmp/icmp.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,13 @@ import (
var errUnsupportedCheck = errors.New("unsupported check")

type Module struct {
Prober string
Timeout time.Duration
PacketCount int64
ICMP config.ICMPProbe
Privileged bool
Prober string
Timeout time.Duration
PacketCount int64
ReqSuccessCount int64
MaxResolveRetries int64
ICMP config.ICMPProbe
Privileged bool
}

type Prober struct {
Expand Down Expand Up @@ -62,12 +64,17 @@ func settingsToModule(settings *sm.PingSettings) Module {

m.ICMP.DontFragment = settings.DontFragment

if settings.PacketCount <= 1 {
m.PacketCount = 1
if settings.PacketCount == 0 {
m.PacketCount = 3
m.ReqSuccessCount = 1
} else {
m.PacketCount = settings.PacketCount
m.ReqSuccessCount = settings.PacketCount // TODO(mem): expose this setting
}

// TODO(mem): add a setting for this
m.MaxResolveRetries = 3

return m
}

Expand Down
8 changes: 4 additions & 4 deletions internal/prober/icmp/icmp_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ func probeICMP(ctx context.Context, target string, module Module, registry *prom
registry.MustRegister(packetsSentGauge)
registry.MustRegister(packetsReceivedGauge)

dstIPAddr, lookupTime, err := chooseProtocol(ctx, module.ICMP.IPProtocol, module.ICMP.IPProtocolFallback, target, registry, logger)
dstIPAddr, lookupTime, err := chooseProtocol(ctx, module.ICMP.IPProtocol, module.ICMP.IPProtocolFallback, target, int(module.MaxResolveRetries), registry, logger)
if err != nil {
_ = level.Warn(logger).Log("msg", "Error resolving address", "err", err)
_ = level.Error(logger).Log("msg", "Error resolving address", "err", err)
return false
}

Expand All @@ -74,7 +74,7 @@ func probeICMP(ctx context.Context, target string, module Module, registry *prom
pinger.SetPrivileged(module.Privileged)

if err := pinger.Resolve(); err != nil {
// This should never happe, the address is already resolved.
// This should never happen, the address is already resolved.
_ = level.Error(logger).Log("msg", "Error resolving address", "err", err)
return false
}
Expand Down Expand Up @@ -151,7 +151,7 @@ func probeICMP(ctx context.Context, target string, module Module, registry *prom
return false
}

return pinger.Count == pinger.PacketsSent && pinger.PacketsRecv == pinger.PacketsSent
return pinger.PacketsSent >= int(module.ReqSuccessCount) && pinger.PacketsRecv >= int(module.ReqSuccessCount)
}

type icmpLogger struct {
Expand Down
94 changes: 77 additions & 17 deletions internal/prober/icmp/icmp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,62 @@ func TestNewProber(t *testing.T) {
},
expected: Prober{
config: Module{
Prober: "ping",
Timeout: 0,
PacketCount: 1,
Privileged: isPrivilegedRequired(),
Prober: "ping",
Timeout: 0,
PacketCount: 3,
ReqSuccessCount: 1,
MaxResolveRetries: 3,
Privileged: isPrivilegedRequired(),
ICMP: config.ICMPProbe{
IPProtocol: "ip6",
IPProtocolFallback: true,
},
},
},
ExpectError: false,
},
"1 packet": {
input: sm.Check{
Target: "www.grafana.com",
Settings: sm.CheckSettings{
Ping: &sm.PingSettings{
PacketCount: 1,
},
},
},
expected: Prober{
config: Module{
Prober: "ping",
Timeout: 0,
PacketCount: 1,
ReqSuccessCount: 1,
MaxResolveRetries: 3,
Privileged: isPrivilegedRequired(),
ICMP: config.ICMPProbe{
IPProtocol: "ip6",
IPProtocolFallback: true,
},
},
},
ExpectError: false,
},
"2 packets": {
input: sm.Check{
Target: "www.grafana.com",
Settings: sm.CheckSettings{
Ping: &sm.PingSettings{
PacketCount: 2,
},
},
},
expected: Prober{
config: Module{
Prober: "ping",
Timeout: 0,
PacketCount: 2,
ReqSuccessCount: 2,
MaxResolveRetries: 3,
Privileged: isPrivilegedRequired(),
ICMP: config.ICMPProbe{
IPProtocol: "ip6",
IPProtocolFallback: true,
Expand All @@ -50,7 +102,7 @@ func TestNewProber(t *testing.T) {
input: sm.Check{
Target: "www.grafana.com",
Settings: sm.CheckSettings{
Http: nil,
Ping: nil,
},
},
expected: Prober{},
Expand Down Expand Up @@ -79,9 +131,11 @@ func TestSettingsToModule(t *testing.T) {
"default": {
input: sm.PingSettings{},
expected: Module{
Prober: "ping",
Timeout: 0,
PacketCount: 1,
Prober: "ping",
Timeout: 0,
PacketCount: 3,
ReqSuccessCount: 1,
MaxResolveRetries: 3,
ICMP: config.ICMPProbe{
IPProtocol: "ip6",
IPProtocolFallback: true,
Expand All @@ -94,9 +148,11 @@ func TestSettingsToModule(t *testing.T) {
SourceIpAddress: "0.0.0.0",
},
expected: Module{
Prober: "ping",
Timeout: 0,
PacketCount: 1,
Prober: "ping",
Timeout: 0,
PacketCount: 3,
ReqSuccessCount: 1,
MaxResolveRetries: 3,
ICMP: config.ICMPProbe{
IPProtocol: "ip4",
IPProtocolFallback: false,
Expand All @@ -110,9 +166,11 @@ func TestSettingsToModule(t *testing.T) {
PacketCount: 1,
},
expected: Module{
Prober: "ping",
Timeout: 0,
PacketCount: 1,
Prober: "ping",
Timeout: 0,
PacketCount: 1,
ReqSuccessCount: 1,
MaxResolveRetries: 3,
ICMP: config.ICMPProbe{
IPProtocol: "ip4",
IPProtocolFallback: false,
Expand All @@ -125,9 +183,11 @@ func TestSettingsToModule(t *testing.T) {
PacketCount: 2,
},
expected: Module{
Prober: "ping",
Timeout: 0,
PacketCount: 2,
Prober: "ping",
Timeout: 0,
PacketCount: 2,
ReqSuccessCount: 2,
MaxResolveRetries: 3,
ICMP: config.ICMPProbe{
IPProtocol: "ip4",
IPProtocolFallback: false,
Expand Down
141 changes: 89 additions & 52 deletions internal/prober/icmp/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ var protocolToGauge = map[string]float64{
}

// Returns the IP for the ipProtocol and lookup time.
func chooseProtocol(ctx context.Context, ipProtocol string, fallbackIPProtocol bool, target string, registry *prometheus.Registry, logger log.Logger) (ip *net.IPAddr, lookupTime float64, err error) {
func chooseProtocol(ctx context.Context, ipProtocol string, fallbackIPProtocol bool, target string, retries int, registry *prometheus.Registry, logger log.Logger) (ip *net.IPAddr, lookupTime float64, err error) {
var fallbackProtocol string
probeDNSLookupTimeSeconds := prometheus.NewGauge(prometheus.GaugeOpts{
Name: "probe_dns_lookup_time_seconds",
Expand Down Expand Up @@ -69,73 +69,110 @@ func chooseProtocol(ctx context.Context, ipProtocol string, fallbackIPProtocol b
}()

resolver := &net.Resolver{}
if !fallbackIPProtocol {
ips, err := resolver.LookupIP(ctx, ipProtocol, target)
if err == nil {
for _, ip := range ips {
_ = level.Info(logger).Log("msg", "Resolved target address", "ip", ip.String())
probeIPProtocolGauge.Set(protocolToGauge[ipProtocol])
probeIPAddrHash.Set(ipHash(ip))
return &net.IPAddr{IP: ip}, lookupTime, nil

for ; retries >= 0; retries-- {
if !fallbackIPProtocol {
ips, err := resolver.LookupIP(ctx, ipProtocol, target)
if err == nil {
for _, ip := range ips {
_ = level.Info(logger).Log("msg", "Resolved target address", "ip", ip.String())
probeIPProtocolGauge.Set(protocolToGauge[ipProtocol])
probeIPAddrHash.Set(ipHash(ip))
return &net.IPAddr{IP: ip}, lookupTime, nil
}
}
_ = level.Warn(logger).Log("msg", "Resolution with IP protocol failed", "err", err)

if isRetryableError(err) {
continue
} else {
break
}
}
_ = level.Error(logger).Log("msg", "Resolution with IP protocol failed", "err", err)
return nil, 0.0, err
}

ips, err := resolver.LookupIPAddr(ctx, target)
if err != nil {
_ = level.Error(logger).Log("msg", "Resolution with IP protocol failed", "err", err)
return nil, 0.0, err
}
ips, err := resolver.LookupIPAddr(ctx, target)
if err != nil {
_ = level.Warn(logger).Log("msg", "Resolution with IP protocol failed", "err", err)

// Return the IP in the requested protocol.
fallbackIdx := int(-1)
for i, ip := range ips {
switch ipProtocol {
case "ip4":
if ip.IP.To4() != nil {
_ = level.Info(logger).Log("msg", "Resolved target address", "ip", ip.String())
probeIPProtocolGauge.Set(4)
probeIPAddrHash.Set(ipHash(ip.IP))
return &ip, lookupTime, nil
if isRetryableError(err) {
continue
} else {
break
}
}

// ip4 as fallback
fallbackIdx = i

case "ip6":
if ip.IP.To4() == nil {
_ = level.Info(logger).Log("msg", "Resolved target address", "ip", ip.String())
probeIPProtocolGauge.Set(6)
probeIPAddrHash.Set(ipHash(ip.IP))
return &ip, lookupTime, nil
// Return the IP in the requested protocol.
fallbackIdx := int(-1)
for i, ip := range ips {
switch ipProtocol {
case "ip4":
if ip.IP.To4() != nil {
_ = level.Info(logger).Log("msg", "Resolved target address", "ip", ip.String())
probeIPProtocolGauge.Set(4)
probeIPAddrHash.Set(ipHash(ip.IP))
return &ip, lookupTime, nil
}

// ip4 as fallback
fallbackIdx = i

case "ip6":
if ip.IP.To4() == nil {
_ = level.Info(logger).Log("msg", "Resolved target address", "ip", ip.String())
probeIPProtocolGauge.Set(6)
probeIPAddrHash.Set(ipHash(ip.IP))
return &ip, lookupTime, nil
}

// ip6 as fallback
fallbackIdx = i
}
}

// ip6 as fallback
fallbackIdx = i
// Unable to find ip and no fallback set.
if fallbackIdx == -1 || !fallbackIPProtocol {
_ = level.Error(logger).Log("msg", "unable to find ip; no fallback")
break
}
}

// Unable to find ip and no fallback set.
if fallbackIdx == -1 || !fallbackIPProtocol {
return nil, 0.0, fmt.Errorf("unable to find ip; no fallback")
// Use fallback ip protocol.
if fallbackProtocol == "ip4" {
probeIPProtocolGauge.Set(4)
} else {
probeIPProtocolGauge.Set(6)
}
fallback := ips[fallbackIdx]
probeIPAddrHash.Set(ipHash(fallback.IP))
_ = level.Info(logger).Log("msg", "Resolved target address", "ip", fallback.String())
return &fallback, lookupTime, nil
}

// Use fallback ip protocol.
if fallbackProtocol == "ip4" {
probeIPProtocolGauge.Set(4)
} else {
probeIPProtocolGauge.Set(6)
}
fallback := ips[fallbackIdx]
probeIPAddrHash.Set(ipHash(fallback.IP))
_ = level.Info(logger).Log("msg", "Resolved target address", "ip", fallback.String())
return &fallback, lookupTime, nil
return nil, 0.0, fmt.Errorf("unable to find ip")
}

func ipHash(ip net.IP) float64 {
h := fnv.New32a()
_, _ = h.Write(ip)
return float64(h.Sum32())
}

func isRetryableError(err error) bool {
if err == nil {
return false
}

// Retry on timeouts.
if ne, ok := err.(net.Error); ok && ne.Timeout() {
return true
}

if dnsErr, ok := err.(*net.DNSError); ok {
// Retry on DNS lookup errors.
if dnsErr.IsTimeout || dnsErr.IsTemporary {
return true
}
}

// Don't retry on other errors.
return false
}

0 comments on commit 49cd928

Please sign in to comment.