Skip to content

Commit

Permalink
ER 2908: Telegraf adding delay between retries for snmp availability (i…
Browse files Browse the repository at this point in the history
  • Loading branch information
sandeep-vunet authored Jun 2, 2022
1 parent bab4dda commit 8292ce4
Showing 1 changed file with 50 additions and 36 deletions.
86 changes: 50 additions & 36 deletions plugins/inputs/snmp_heartbeat/snmp_heartbeat.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,9 @@ type Snmp struct {
initialized bool
nativePingFunc NativePingFunc
// Packet size
Size *int `toml:"size"`
Size *int `toml:"size"`
ProcessTimeout int64 `toml:"whole_process_wait_time"`
ProcessRetries int `toml:"whole_process_retry_count"`
}

func (s *Snmp) init() error {
Expand Down Expand Up @@ -344,9 +346,11 @@ func init() {
Version: 2,
Community: "public",
},
Packets: 3,
Alias: "",
Icmp: false,
Packets: 3,
Alias: "",
Icmp: false,
ProcessTimeout: 10,
ProcessRetries: 0,
}
s.nativePingFunc = s.nativePing
return s
Expand Down Expand Up @@ -400,40 +404,50 @@ func (s *Snmp) Gather(acc telegraf.Accumulator) error {
period_str := strings.Trim(s.Period, "s")
period, err = strconv.Atoi(period_str)
}
if err := s.gatherTable(acc, gs, t, topTags, false, period); err != nil {
if s.Probe {
rt := RTable{
Name: t.Name,
Time: time.Now(), //TODO record time at start
Rows: make([]RTableRow, 0, 1),
}
rtr := RTableRow{}
rtr.Tags = map[string]string{}
rtr.Fields = map[string]interface{}{}
if _, ok := rtr.Tags[s.AgentHostTag]; !ok {
rtr.Tags[s.AgentHostTag] = gs.Host()
}
if !s.Icmp {
rtr.Fields["method"] = "snmp"
rtr.Fields["target_state"] = "Down"
rtr.Fields["average_rtt"] = 0
rtr.Fields["minimum_rtt"] = 0
rtr.Fields["maximum_rtt"] = 0
rtr.Fields["jitter"] = 0
rtr.Fields["pkt_loss_pct"] = 100
rtr.Fields["period"] = period
rtr.Fields["snmp_error_description"] = err.Error()

rt.Rows = append(rt.Rows, rtr)

acc.AddFields(rt.Name, rtr.Fields, rtr.Tags, rt.Time)
for retries := int64(0); retries <= int64(s.ProcessRetries); retries++ {
if err := s.gatherTable(acc, gs, t, topTags, false, period); err != nil {
if s.Probe {
rt := RTable{
Name: t.Name,
Time: time.Now(), //TODO record time at start
Rows: make([]RTableRow, 0, 1),
}
rtr := RTableRow{}
rtr.Tags = map[string]string{}
rtr.Fields = map[string]interface{}{}
if _, ok := rtr.Tags[s.AgentHostTag]; !ok {
rtr.Tags[s.AgentHostTag] = gs.Host()
}
if !s.Icmp {
if retries < int64(s.ProcessRetries) {
log.Printf("%s state down retrying after %d seconds", gs.Host(), s.ProcessTimeout)
time.Sleep(time.Duration(s.ProcessTimeout) * time.Second)
continue
}
rtr.Fields["method"] = "snmp"
rtr.Fields["target_state"] = "Down"
rtr.Fields["average_rtt"] = 0
rtr.Fields["minimum_rtt"] = 0
rtr.Fields["maximum_rtt"] = 0
rtr.Fields["jitter"] = 0
rtr.Fields["pkt_loss_pct"] = 100
rtr.Fields["period"] = period
rtr.Fields["snmp_error_description"] = err.Error()
rt.Rows = append(rt.Rows, rtr)
acc.AddFields(rt.Name, rtr.Fields, rtr.Tags, rt.Time)
} else {
fields := s.pingToURLNative(gs.Host(), acc, period)
if retries < int64(s.ProcessRetries) && fields["target_state"] == "Down" {
log.Printf("%s state down retrying after %d seconds", gs.Host(), s.ProcessTimeout)
time.Sleep(time.Duration(s.ProcessTimeout) * time.Second)
continue
}
fields["snmp_error_description"] = err.Error()
acc.AddFields(rt.Name, fields, rtr.Tags, rt.Time)
}
} else {
fields := s.pingToURLNative(gs.Host(), acc, period)
fields["snmp_error_description"] = err.Error()
acc.AddFields(rt.Name, fields, rtr.Tags, rt.Time)
acc.AddError(fmt.Errorf("agent %s: %w", agent, err))
}
} else {
acc.AddError(fmt.Errorf("agent %s: %w", agent, err))
}
}

Expand Down

0 comments on commit 8292ce4

Please sign in to comment.