-
Notifications
You must be signed in to change notification settings - Fork 133
/
telemetry.go
306 lines (265 loc) · 14.1 KB
/
telemetry.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
package statsd
import (
"fmt"
"sync"
"time"
)
/*
telemetryInterval is the interval at which telemetry will be sent by the client.
*/
const telemetryInterval = 10 * time.Second
/*
clientTelemetryTag is a tag identifying this specific client.
*/
var clientTelemetryTag = "client:go"
/*
clientVersionTelemetryTag is a tag identifying this specific client version.
*/
var clientVersionTelemetryTag = "client_version:5.4.0"
// Telemetry represents internal metrics about the client behavior since it started.
type Telemetry struct {
	//
	// Those are produced by the 'Client'
	//

	// TotalMetrics is the total number of metrics sent by the client before aggregation and sampling.
	TotalMetrics uint64
	// TotalMetricsGauge is the total number of gauges sent by the client before aggregation and sampling.
	TotalMetricsGauge uint64
	// TotalMetricsCount is the total number of counts sent by the client before aggregation and sampling.
	TotalMetricsCount uint64
	// TotalMetricsHistogram is the total number of histograms sent by the client before aggregation and sampling.
	TotalMetricsHistogram uint64
	// TotalMetricsDistribution is the total number of distributions sent by the client before aggregation and
	// sampling.
	TotalMetricsDistribution uint64
	// TotalMetricsSet is the total number of sets sent by the client before aggregation and sampling.
	TotalMetricsSet uint64
	// TotalMetricsTiming is the total number of timings sent by the client before aggregation and sampling.
	TotalMetricsTiming uint64
	// TotalEvents is the total number of events sent by the client before aggregation and sampling.
	TotalEvents uint64
	// TotalServiceChecks is the total number of service_checks sent by the client before aggregation and sampling.
	TotalServiceChecks uint64
	// TotalDroppedOnReceive is the total number of metrics/events/service_checks dropped when using ChannelMode (see
	// WithChannelMode option).
	TotalDroppedOnReceive uint64

	//
	// Those are produced by the 'sender'
	//

	// TotalPayloadsSent is the total number of payloads (packets on the network) successfully sent by the client. When
	// using UDP we don't know if packets were dropped or not, so all packets are considered as successfully sent.
	TotalPayloadsSent uint64
	// TotalPayloadsDropped is the total number of payloads dropped by the client. This includes all causes of drops
	// (TotalPayloadsDroppedQueueFull and TotalPayloadsDroppedWriter). When using UDP this won't include the
	// packets dropped by the network.
	TotalPayloadsDropped uint64
	// TotalPayloadsDroppedWriter is the total number of payloads dropped by the writer (when using UDS or named
	// pipe) due to network timeout or error.
	TotalPayloadsDroppedWriter uint64
	// TotalPayloadsDroppedQueueFull is the total number of payloads dropped internally because the queue of payloads
	// waiting to be sent on the wire is full. This means the client is generating more metrics than can be sent on
	// the wire. If your app sends metrics in batch look at WithSenderQueueSize option to increase the queue size.
	TotalPayloadsDroppedQueueFull uint64
	// TotalBytesSent is the total number of bytes successfully sent by the client. When using UDP we don't know if
	// packets were dropped or not, so all packets are considered as successfully sent.
	TotalBytesSent uint64
	// TotalBytesDropped is the total number of bytes dropped by the client. This includes all causes of drops
	// (TotalBytesDroppedQueueFull and TotalBytesDroppedWriter). When using UDP this
	// won't include the bytes dropped by the network.
	TotalBytesDropped uint64
	// TotalBytesDroppedWriter is the total number of bytes dropped by the writer (when using UDS or named pipe) due
	// to network timeout or error.
	TotalBytesDroppedWriter uint64
	// TotalBytesDroppedQueueFull is the total number of bytes dropped internally because the queue of payloads
	// waiting to be sent on the wire is full. This means the client is generating more metrics than can be sent on
	// the wire. If your app sends metrics in batch look at WithSenderQueueSize option to increase the queue size.
	TotalBytesDroppedQueueFull uint64

	//
	// Those are produced by the 'aggregator'
	//

	// AggregationNbContext is the total number of contexts flushed by the aggregator when either
	// WithClientSideAggregation or WithExtendedClientSideAggregation options are enabled.
	AggregationNbContext uint64
	// AggregationNbContextGauge is the total number of contexts for gauges flushed by the aggregator when either
	// WithClientSideAggregation or WithExtendedClientSideAggregation options are enabled.
	AggregationNbContextGauge uint64
	// AggregationNbContextCount is the total number of contexts for counts flushed by the aggregator when either
	// WithClientSideAggregation or WithExtendedClientSideAggregation options are enabled.
	AggregationNbContextCount uint64
	// AggregationNbContextSet is the total number of contexts for sets flushed by the aggregator when either
	// WithClientSideAggregation or WithExtendedClientSideAggregation options are enabled.
	AggregationNbContextSet uint64
	// AggregationNbContextHistogram is the total number of contexts for histograms flushed by the aggregator when either
	// WithClientSideAggregation or WithExtendedClientSideAggregation options are enabled.
	AggregationNbContextHistogram uint64
	// AggregationNbContextDistribution is the total number of contexts for distributions flushed by the aggregator when either
	// WithClientSideAggregation or WithExtendedClientSideAggregation options are enabled.
	AggregationNbContextDistribution uint64
	// AggregationNbContextTiming is the total number of contexts for timings flushed by the aggregator when either
	// WithClientSideAggregation or WithExtendedClientSideAggregation options are enabled.
	AggregationNbContextTiming uint64
}
// telemetryClient periodically samples the client's internal counters and
// reports them back as regular dogstatsd metrics (see flush/sendTelemetry).
type telemetryClient struct {
	sync.RWMutex // used mostly to change the transport tag.

	c          *Client
	aggEnabled bool // is aggregation enabled and should we send aggregation telemetry.

	transport string               // last transport name observed via c.GetTransport(); used to detect changes.
	tags      []string             // base telemetry tags: client tags + client/version (+ transport) tags.
	tagsByType map[metricType][]string // per-metric-type tag sets: base tags + "metrics_type:*".

	// transportTagKnown: NOTE(review) — not referenced anywhere in this file; confirm it is still used elsewhere.
	transportTagKnown bool

	sender *sender // optional dedicated sender when a custom telemetry address is configured.
	worker *worker // optional dedicated worker paired with sender; nil when telemetry goes through the main client.

	lastSample Telemetry // The previous sample of telemetry sent; flush() reports diffs against it.
}
// newTelemetryClient builds a telemetryClient bound to c. Telemetry metrics are
// sent through the client's own pipeline; aggregationEnabled controls whether
// aggregator telemetry is also reported.
func newTelemetryClient(c *Client, aggregationEnabled bool) *telemetryClient {
	tc := &telemetryClient{
		c:          c,
		aggEnabled: aggregationEnabled,
		tags:       []string{},
		tagsByType: map[metricType][]string{},
	}
	// Compute the initial tag sets right away.
	tc.setTags()
	return tc
}
// newTelemetryClientWithCustomAddr builds a telemetryClient that ships its
// metrics to a dedicated telemetryAddr instead of the client's main endpoint.
// It creates its own writer/sender/worker pipeline sharing the given buffer
// pool. Returns an error if telemetryAddr cannot be resolved.
func newTelemetryClientWithCustomAddr(c *Client, telemetryAddr string, aggregationEnabled bool, pool *bufferPool,
	writeTimeout time.Duration, connectTimeout time.Duration,
) (*telemetryClient, error) {
	telemetryWriter, _, err := createWriter(telemetryAddr, writeTimeout, connectTimeout)
	if err != nil {
		// Wrap with %w so callers can unwrap the underlying resolution error;
		// error strings are lowercase per Go convention.
		return nil, fmt.Errorf("could not resolve telemetry address: %w", err)
	}

	t := newTelemetryClient(c, aggregationEnabled)

	// Creating a custom sender/worker with 1 worker in mutex mode for the
	// telemetry that share the same bufferPool.
	// FIXME due to performance pitfall, we're always using UDP defaults
	// even for UDS.
	t.sender = newSender(telemetryWriter, DefaultUDPBufferPoolSize, pool, c.errorHandler)
	t.worker = newWorker(pool, t.sender)
	return t, nil
}
// run starts the background goroutine that flushes telemetry every
// telemetryInterval until stop is closed. The goroutine is registered on wg so
// callers can wait for a clean shutdown; the dedicated sender, when present, is
// closed on exit.
func (t *telemetryClient) run(wg *sync.WaitGroup, stop chan struct{}) {
	wg.Add(1)
	go func() {
		defer wg.Done()
		ticker := time.NewTicker(telemetryInterval)
		defer ticker.Stop()
		for {
			select {
			case <-stop:
				if t.sender != nil {
					t.sender.close()
				}
				return
			case <-ticker.C:
				t.sendTelemetry()
			}
		}
	}()
}
// sendTelemetry flushes the telemetry metrics and pushes them either through
// the dedicated worker (custom telemetry address) or through the main client.
func (t *telemetryClient) sendTelemetry() {
	metrics := t.flush()

	if t.worker == nil {
		// No dedicated pipeline: send through the regular client path.
		for _, m := range metrics {
			t.c.send(m)
		}
		return
	}

	for _, m := range metrics {
		t.worker.processMetric(m)
	}
	t.worker.flush()
}
// getTelemetry assembles a snapshot of the client's internal counters from the
// client, sender and aggregator, then fills in the derived roll-up totals.
// Safe to call on a nil receiver: telemetry disabled (WithoutTelemetry option)
// yields a zero Telemetry.
func (t *telemetryClient) getTelemetry() Telemetry {
	if t == nil {
		// telemetry was disabled through the WithoutTelemetry option
		return Telemetry{}
	}

	var tlm Telemetry
	t.c.flushTelemetryMetrics(&tlm)
	t.c.sender.flushTelemetryMetrics(&tlm)
	t.c.agg.flushTelemetryMetrics(&tlm)

	// Roll the per-type counters up into the aggregate totals.
	tlm.TotalMetrics = tlm.TotalMetricsGauge +
		tlm.TotalMetricsCount +
		tlm.TotalMetricsSet +
		tlm.TotalMetricsHistogram +
		tlm.TotalMetricsDistribution +
		tlm.TotalMetricsTiming

	tlm.TotalPayloadsDropped = tlm.TotalPayloadsDroppedQueueFull + tlm.TotalPayloadsDroppedWriter
	tlm.TotalBytesDropped = tlm.TotalBytesDroppedQueueFull + tlm.TotalBytesDroppedWriter

	if t.aggEnabled {
		tlm.AggregationNbContext = tlm.AggregationNbContextGauge +
			tlm.AggregationNbContextCount +
			tlm.AggregationNbContextSet +
			tlm.AggregationNbContextHistogram +
			tlm.AggregationNbContextDistribution +
			tlm.AggregationNbContextTiming
	}
	return tlm
}
// setTags refreshes the telemetry tag sets when they were never computed or
// when the transport name changed. For example when `unix://` is used we might
// return `uds` until we actually connect and detect that this is a UDS Stream
// socket and then return `uds-stream`.
func (t *telemetryClient) setTags() {
	transport := t.c.GetTransport()

	t.RLock()
	// We need to refresh if we never set the tags or if the transport changed.
	needsRefresh := len(t.tags) == len(t.c.tags) || t.transport != transport
	t.RUnlock()

	if !needsRefresh {
		return
	}

	t.Lock()
	defer t.Unlock()
	t.transport = transport

	// Build a fresh slice instead of appending directly onto t.c.tags: if the
	// client's slice has spare capacity, append would write the telemetry tags
	// into its backing array and corrupt the client's own tag list.
	tags := make([]string, 0, len(t.c.tags)+3)
	tags = append(tags, t.c.tags...)
	tags = append(tags, clientTelemetryTag, clientVersionTelemetryTag)
	if transport != "" {
		tags = append(tags, "client_transport:"+transport)
	}
	t.tags = tags

	// One independent copy of the base tags per metric type, each extended
	// with its "metrics_type:*" tag.
	for mt, typeTag := range map[metricType]string{
		gauge:        "metrics_type:gauge",
		count:        "metrics_type:count",
		set:          "metrics_type:set",
		timing:       "metrics_type:timing",
		histogram:    "metrics_type:histogram",
		distribution: "metrics_type:distribution",
	} {
		t.tagsByType[mt] = append(append(make([]string, 0, len(tags)+1), tags...), typeTag)
	}
}
// flush returns the Telemetry metrics to be flushed. It's its own function to
// ease testing. Each value reported is the difference between the current
// snapshot and the previous one (lastSample): this keeps the same telemetry
// behavior as V4 so users' dashboards aren't broken when upgrading to V5, and
// allows graphing a mix of V4 and V5 apps on the same dashboard.
func (t *telemetryClient) flush() []metric {
	out := []metric{}

	// emit appends a count metric; same as Count but without global namespace.
	emit := func(name string, value int64, tags []string) {
		out = append(out, metric{metricType: count, name: name, ivalue: value, tags: tags, rate: 1})
	}
	// delta computes the increase since the previous telemetry flush.
	delta := func(cur, prev uint64) int64 {
		return int64(cur - prev)
	}

	tlm := t.getTelemetry()
	t.setTags()
	prev := t.lastSample

	emit("datadog.dogstatsd.client.metrics", delta(tlm.TotalMetrics, prev.TotalMetrics), t.tags)
	emit("datadog.dogstatsd.client.metrics_by_type", delta(tlm.TotalMetricsGauge, prev.TotalMetricsGauge), t.tagsByType[gauge])
	emit("datadog.dogstatsd.client.metrics_by_type", delta(tlm.TotalMetricsCount, prev.TotalMetricsCount), t.tagsByType[count])
	emit("datadog.dogstatsd.client.metrics_by_type", delta(tlm.TotalMetricsHistogram, prev.TotalMetricsHistogram), t.tagsByType[histogram])
	emit("datadog.dogstatsd.client.metrics_by_type", delta(tlm.TotalMetricsDistribution, prev.TotalMetricsDistribution), t.tagsByType[distribution])
	emit("datadog.dogstatsd.client.metrics_by_type", delta(tlm.TotalMetricsSet, prev.TotalMetricsSet), t.tagsByType[set])
	emit("datadog.dogstatsd.client.metrics_by_type", delta(tlm.TotalMetricsTiming, prev.TotalMetricsTiming), t.tagsByType[timing])
	emit("datadog.dogstatsd.client.events", delta(tlm.TotalEvents, prev.TotalEvents), t.tags)
	emit("datadog.dogstatsd.client.service_checks", delta(tlm.TotalServiceChecks, prev.TotalServiceChecks), t.tags)
	emit("datadog.dogstatsd.client.metric_dropped_on_receive", delta(tlm.TotalDroppedOnReceive, prev.TotalDroppedOnReceive), t.tags)
	emit("datadog.dogstatsd.client.packets_sent", delta(tlm.TotalPayloadsSent, prev.TotalPayloadsSent), t.tags)
	emit("datadog.dogstatsd.client.packets_dropped", delta(tlm.TotalPayloadsDropped, prev.TotalPayloadsDropped), t.tags)
	emit("datadog.dogstatsd.client.packets_dropped_queue", delta(tlm.TotalPayloadsDroppedQueueFull, prev.TotalPayloadsDroppedQueueFull), t.tags)
	emit("datadog.dogstatsd.client.packets_dropped_writer", delta(tlm.TotalPayloadsDroppedWriter, prev.TotalPayloadsDroppedWriter), t.tags)
	emit("datadog.dogstatsd.client.bytes_dropped", delta(tlm.TotalBytesDropped, prev.TotalBytesDropped), t.tags)
	emit("datadog.dogstatsd.client.bytes_sent", delta(tlm.TotalBytesSent, prev.TotalBytesSent), t.tags)
	emit("datadog.dogstatsd.client.bytes_dropped_queue", delta(tlm.TotalBytesDroppedQueueFull, prev.TotalBytesDroppedQueueFull), t.tags)
	emit("datadog.dogstatsd.client.bytes_dropped_writer", delta(tlm.TotalBytesDroppedWriter, prev.TotalBytesDroppedWriter), t.tags)

	if t.aggEnabled {
		emit("datadog.dogstatsd.client.aggregated_context", delta(tlm.AggregationNbContext, prev.AggregationNbContext), t.tags)
		emit("datadog.dogstatsd.client.aggregated_context_by_type", delta(tlm.AggregationNbContextGauge, prev.AggregationNbContextGauge), t.tagsByType[gauge])
		emit("datadog.dogstatsd.client.aggregated_context_by_type", delta(tlm.AggregationNbContextSet, prev.AggregationNbContextSet), t.tagsByType[set])
		emit("datadog.dogstatsd.client.aggregated_context_by_type", delta(tlm.AggregationNbContextCount, prev.AggregationNbContextCount), t.tagsByType[count])
		emit("datadog.dogstatsd.client.aggregated_context_by_type", delta(tlm.AggregationNbContextHistogram, prev.AggregationNbContextHistogram), t.tagsByType[histogram])
		emit("datadog.dogstatsd.client.aggregated_context_by_type", delta(tlm.AggregationNbContextDistribution, prev.AggregationNbContextDistribution), t.tagsByType[distribution])
		emit("datadog.dogstatsd.client.aggregated_context_by_type", delta(tlm.AggregationNbContextTiming, prev.AggregationNbContextTiming), t.tagsByType[timing])
	}

	t.lastSample = tlm
	return out
}