feat(dynatraceexporter): provide better estimated summaries for partial histograms (#11044)

* feat(dynatraceexporter): provide better estimated summaries for partial histograms

* Update changelog

* Lint

* dynatraceexporter: test sum estimation

* Clarification comments

* dynatraceexporter: Clarify count may not be zero for histogram estimation
dyladan authored Jun 20, 2022
1 parent 8e90b72 commit ccfee86
Showing 3 changed files with 283 additions and 41 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -34,6 +34,7 @@
 - `examples`: Add an example for scraping Couchbase metrics (#10894)
 - `filestorageextension`: Add background compaction capability (#9327)
 - `googlecloudpubsubreceiver`: Added new `Endpoint` and `Insecure` connection configuration options. (#10845)
+- `dynatraceexporter`: Provide better estimated summaries for partial histograms. (#11044)
 - `mongodbreceiver`: Add integration test for mongodb receiver (#10864)
 - `mezmoexporter`: add logging for HTTP errors (#10875)
 - `signalfxexporter`: Enable the exporting of seven Kubernetes metrics used in Splunk/SignalFx content by default (#11032)
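Before the diff itself, a note on the approach: the new estimation works entirely from the explicit bucket bounds and per-bucket counts. The first and last buckets are unbounded, so only their inner bound can be used; every other non-empty bucket contributes its midpoint times its count to the sum estimate. A minimal standalone sketch of that idea (not part of the commit; it uses plain slices instead of pmetric types and omits the override and clamping steps that the real histDataPointToSummary performs):

package main

import "fmt"

// estimateSummary is an illustrative sketch of the estimation idea in this commit:
// when a histogram data point lacks min, max, or sum, derive estimates from the
// explicit bucket bounds and per-bucket counts.
func estimateSummary(bounds []float64, counts []uint64) (min, max, sum float64) {
	foundNonEmpty := false
	for i, c := range counts {
		if c == 0 {
			continue
		}
		if !foundNonEmpty {
			foundNonEmpty = true
			if i == 0 {
				min = bounds[0] // first bucket has no lower bound; use its upper bound
			} else {
				min = bounds[i-1]
			}
		}
		if i == len(counts)-1 {
			max = bounds[i-1] // last bucket has no upper bound; use its lower bound
		} else {
			max = bounds[i]
		}
		switch i {
		case 0:
			sum += float64(c) * bounds[i] // first bucket: only the upper bound is known
		case len(counts) - 1:
			sum += float64(c) * bounds[i-1] // last bucket: only the lower bound is known
		default:
			sum += float64(c) * (bounds[i] + bounds[i-1]) / 2 // interior bucket: use the midpoint
		}
	}
	return min, max, sum
}

func main() {
	// bounds {1,2,3,4,5} define 6 buckets: (-Inf,1], (1,2], ..., (5,+Inf)
	min, max, sum := estimateSummary([]float64{1, 2, 3, 4, 5}, []uint64{0, 3, 5, 0, 0, 0})
	fmt.Println(min, max, sum)
}

For bounds {1, 2, 3, 4, 5} and counts {0, 3, 5, 0, 0, 0} this prints "1 3 17", the same sum the new tests below assert (3*1.5 + 5*2.5).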
130 changes: 89 additions & 41 deletions exporter/dynatraceexporter/internal/serialization/histogram.go
@@ -36,19 +36,14 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
         return "", nil
     }

-    min, max := estimateHistMinMax(dp)
+    min, max, sum := histDataPointToSummary(dp)

     dm, err := dtMetric.NewMetric(
         name,
         dtMetric.WithPrefix(prefix),
         dtMetric.WithDimensions(dims),
         dtMetric.WithTimestamp(dp.Timestamp().AsTime()),
-        dtMetric.WithFloatSummaryValue(
-            min,
-            max,
-            dp.Sum(),
-            int64(dp.Count()),
-        ),
+        dtMetric.WithFloatSummaryValue(min, max, sum, int64(dp.Count())),
     )

     if err != nil {
@@ -58,64 +53,117 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
     return dm.Serialize()
 }

-// estimateHistMinMax returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets.
-func estimateHistMinMax(dp pmetric.HistogramDataPoint) (float64, float64) {
+// histDataPointToSummary returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets.
+// It MAY NOT be called with a data point with dp.Count() == 0.
+func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
     bounds := dp.MExplicitBounds()
     counts := dp.MBucketCounts()

-    // shortcut in the case both max and min are provided
-    if dp.HasMin() && dp.HasMax() {
-        return dp.Min(), dp.Max()
+    // shortcut if min, max, and sum are provided
+    if dp.HasMin() && dp.HasMax() && dp.HasSum() {
+        return dp.Min(), dp.Max(), dp.Sum()
     }

-    // Because we do not know the actual min and max, we estimate them based on the min and max non-empty bucket
-    minIdx, maxIdx := -1, -1
-    for y := 0; y < len(counts); y++ {
-        if counts[y] > 0 {
-            if minIdx == -1 {
-                minIdx = y
-            }
-            maxIdx = y
-        }
+    // a single-bucket histogram is a special case
+    if len(counts) == 1 {
+        return estimateSingleBucketHistogram(dp)
     }

-    if minIdx == -1 || maxIdx == -1 {
-        return 0, 0
-    }
+    // If any of min, max, sum is not provided in the data point,
+    // loop through the buckets to estimate them.
+    // All three values are estimated in order to avoid looping multiple times
+    // or complicating the loop with branches. After the loop, estimates
+    // will be overridden with any values provided by the data point.
+    foundNonEmptyBucket := false
+    var min, max, sum float64 = 0, 0, 0
+
+    // Because we do not know the actual min, max, or sum, we estimate them based on non-empty buckets
+    for i := 0; i < len(counts); i++ {
+        // empty bucket
+        if counts[i] == 0 {
+            continue
+        }

-    var min, max float64
+        // range for bucket counts[i] is bounds[i-1] to bounds[i]

-    if dp.HasMin() {
-        min = dp.Min()
-    } else {
-        // Use lower bound for min unless it is the first bucket which has no lower bound, then use upper
-        if minIdx == 0 {
-            min = bounds[minIdx]
+        // min estimation
+        if !foundNonEmptyBucket {
+            foundNonEmptyBucket = true
+            if i == 0 {
+                // if we're in the first bucket, the best estimate we can make for min is the upper bound
+                min = bounds[i]
+            } else {
+                min = bounds[i-1]
+            }
+        }
+
+        // max estimation
+        if i == len(counts)-1 {
+            // if we're in the last bucket, the best estimate we can make for max is the lower bound
+            max = bounds[i-1]
         } else {
-            min = bounds[minIdx-1]
+            max = bounds[i]
         }
+
+        // sum estimation
+        switch i {
+        case 0:
+            // in the first bucket, estimate sum using the upper bound
+            sum += float64(counts[i]) * bounds[i]
+        case len(counts) - 1:
+            // in the last bucket, estimate sum using the lower bound
+            sum += float64(counts[i]) * bounds[i-1]
+        default:
+            // in any other bucket, estimate sum using the bucket midpoint
+            sum += float64(counts[i]) * (bounds[i] + bounds[i-1]) / 2
+        }
     }

+    // Override estimates with any values provided by the data point
+    if dp.HasMin() {
+        min = dp.Min()
+    }
     if dp.HasMax() {
         max = dp.Max()
-    } else {
-        // Use upper bound for max unless it is the last bucket which has no upper bound, then use lower
-        if maxIdx == len(counts)-1 {
-            max = bounds[maxIdx-1]
-        } else {
-            max = bounds[maxIdx]
-        }
     }
+    if dp.HasSum() {
+        sum = dp.Sum()
+    }

     // Set min to average when higher than average. This can happen when most values are lower than first boundary (falling in first bucket).
     // Set max to average when lower than average. This can happen when most values are higher than last boundary (falling in last bucket).
-    avg := dp.Sum() / float64(dp.Count())
+    // dp.Count() will never be zero
+    avg := sum / float64(dp.Count())
     if min > avg {
         min = avg
     }
     if max < avg {
         max = avg
     }

-    return min, max
+    return min, max, sum
 }
+
+func estimateSingleBucketHistogram(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
+    min, max, sum := 0.0, 0.0, 0.0
+
+    if dp.HasSum() {
+        sum = dp.Sum()
+    }
+
+    mean := sum / float64(dp.Count())
+
+    if dp.HasMin() {
+        min = dp.Min()
+    } else {
+        min = mean
+    }
+
+    if dp.HasMax() {
+        max = dp.Max()
+    } else {
+        max = mean
+    }
+
+    return min, max, sum
+}
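As an illustration of how these estimates surface downstream (not part of the commit; the metric name is hypothetical and the line is formatted by hand in the same line-protocol shape the existing test below asserts on):

package main

import "fmt"

func main() {
	// Hypothetical data point: bounds {1, 2, 3, 4, 5}, counts {0, 0, 0, 0, 0, 2}, sum 20.2, count 2
	// (the same shape as the "only the last bucket has values" test below). The lower bound of the
	// last bucket (5) would violate max >= sum/count, so max is clamped to the mean 10.1.
	min, max, sum, count := 5.0, 10.1, 20.2, int64(2)
	ts := int64(1626438600000) // timestamp value reused from the existing tests

	fmt.Printf("prefix.partial_hist gauge,min=%g,max=%g,sum=%g,count=%d %d\n", min, max, sum, count, ts)
	// prints: prefix.partial_hist gauge,min=5,max=10.1,sum=20.2,count=2 1626438600000
}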
193 changes: 193 additions & 0 deletions exporter/dynatraceexporter/internal/serialization/histogram_test.go
@@ -131,4 +131,197 @@ func Test_serializeHistogram(t *testing.T) {
         assert.NoError(t, err)
         assert.Equal(t, "prefix.min_max_hist gauge,min=3,max=7,sum=10,count=2 1626438600000", got)
     })
+
+    t.Run("when min is not provided it should be estimated", func(t *testing.T) {
+        t.Run("values between first two boundaries", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+            hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0})
+            hist.SetCount(6)
+            hist.SetSum(21.2)
+
+            min, _, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 1.0, min, "use bucket min")
+        })
+
+        t.Run("first bucket has value", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+            hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4})
+            hist.SetCount(8)
+            hist.SetSum(34.5)
+
+            min, _, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 1.0, min, "use the first boundary as estimation instead of Inf")
+        })
+
+        t.Run("only the first bucket has values, use the mean", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+            hist.SetMBucketCounts([]uint64{3, 0, 0, 0, 0, 0})
+            hist.SetCount(3)
+            hist.SetSum(0.75)
+
+            min, _, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 0.25, min)
+        })
+        t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{})
+            hist.SetMBucketCounts([]uint64{4})
+            hist.SetCount(4)
+            hist.SetSum(8.8)
+
+            min, _, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 2.2, min, "calculate the mean as min value")
+        })
+        t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{})
+            hist.SetMBucketCounts([]uint64{1})
+            hist.SetCount(1)
+            hist.SetSum(1.2)
+
+            min, _, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 1.2, min, "calculate the mean as min value")
+        })
+        t.Run("only the last bucket has a value", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+            hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 3})
+            hist.SetCount(3)
+            hist.SetSum(15.6)
+
+            min, _, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 5.0, min, "use the lower bound")
+        })
+    })
+
+    t.Run("when max is not provided it should be estimated", func(t *testing.T) {
+        t.Run("values between the last two boundaries", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+            hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0})
+            hist.SetSum(21.2)
+            hist.SetCount(6)
+
+            _, max, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 5.0, max, "use bucket max")
+        })
+
+        t.Run("last bucket has value", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+            hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4})
+            hist.SetSum(34.5)
+            hist.SetCount(8)
+
+            _, max, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 5.0, max, "use the last boundary as estimation instead of Inf")
+        })
+
+        t.Run("only the last bucket has values", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+            hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 2})
+            hist.SetSum(20.2)
+            hist.SetCount(2)
+
+            _, max, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 10.1, max, "use the mean (10.1) Otherwise, the max would be estimated as 5, and max >= avg would be violated")
+        })
+
+        t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{})
+            hist.SetMBucketCounts([]uint64{4})
+            hist.SetSum(8.8)
+            hist.SetCount(4)
+
+            _, max, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 2.2, max, "calculate the mean as max value")
+        })
+
+        t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{})
+            hist.SetMBucketCounts([]uint64{1})
+            hist.SetSum(1.2)
+            hist.SetCount(1)
+
+            _, max, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 1.2, max, "calculate the mean as max value")
+        })
+
+        t.Run("max is larger than sum", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{0, 5})
+            hist.SetMBucketCounts([]uint64{0, 2, 0})
+            hist.SetSum(2.3)
+            hist.SetCount(2)
+
+            _, max, _ := histDataPointToSummary(hist)
+
+            assert.Equal(t, 5.0, max, "use the estimated boundary")
+        })
+    })
+
+    t.Run("when sum is not provided it should be estimated", func(t *testing.T) {
+        t.Run("single bucket histogram", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{})
+            hist.SetMBucketCounts([]uint64{13})
+            hist.SetCount(6)
+
+            _, _, sum := histDataPointToSummary(hist)
+
+            assert.Equal(t, 0.0, sum, "estimate zero (midpoint of [-Inf, Inf])")
+        })
+
+        t.Run("data in bounded buckets", func(t *testing.T) {
+            hist := pmetric.NewHistogramDataPoint()
+            hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+            hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 0})
+            hist.SetCount(6)
+
+            _, _, sum := histDataPointToSummary(hist)
+
+            assert.Equal(t, 3*1.5+5*2.5, sum, "estimate sum using bucket midpoints")
+        })
+
+        t.Run("data in unbounded buckets", func(t *testing.T) {
+            t.Run("first bucket", func(t *testing.T) {
+                hist := pmetric.NewHistogramDataPoint()
+                hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+                hist.SetMBucketCounts([]uint64{2, 3, 5, 0, 0, 0})
+                hist.SetCount(6)
+
+                _, _, sum := histDataPointToSummary(hist)
+
+                assert.Equal(t, 1*2+3*1.5+5*2.5, sum, "use bucket upper bound")
+            })
+
+            t.Run("last bucket", func(t *testing.T) {
+                hist := pmetric.NewHistogramDataPoint()
+                hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+                hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 2})
+                hist.SetCount(6)
+
+                _, _, sum := histDataPointToSummary(hist)
+
+                assert.Equal(t, 3*1.5+5*2.5+2*5, sum, "use bucket upper bound")
+            })
+        })
+    })
 }
