diff --git a/CHANGELOG.md b/CHANGELOG.md index 50fb9d54dcbd..06cc1ed82b5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -34,6 +34,7 @@ - `examples`: Add an example for scraping Couchbase metrics (#10894) - `filestorageextension`: Add background compaction capability (#9327) - `googlecloudpubsubreceiver`: Added new `Endpoint` and `Insecure` connection configuration options. (#10845) +- `dynatraceexporter`: Provide better estimated summaries for partial histograms. (#11044) - `mongodbreceiver`: Add integration test for mongodb receiver (#10864) - `mezmoexporter`: add logging for HTTP errors (#10875) - `signalfxexporter`: Enable the exporting of seven Kubernetes metrics used in Splunk/SignalFx content by default (#11032) diff --git a/exporter/dynatraceexporter/internal/serialization/histogram.go b/exporter/dynatraceexporter/internal/serialization/histogram.go index f543e6d43b7b..a3b8b4c699bc 100644 --- a/exporter/dynatraceexporter/internal/serialization/histogram.go +++ b/exporter/dynatraceexporter/internal/serialization/histogram.go @@ -36,19 +36,14 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension return "", nil } - min, max := estimateHistMinMax(dp) + min, max, sum := histDataPointToSummary(dp) dm, err := dtMetric.NewMetric( name, dtMetric.WithPrefix(prefix), dtMetric.WithDimensions(dims), dtMetric.WithTimestamp(dp.Timestamp().AsTime()), - dtMetric.WithFloatSummaryValue( - min, - max, - dp.Sum(), - int64(dp.Count()), - ), + dtMetric.WithFloatSummaryValue(min, max, sum, int64(dp.Count())), ) if err != nil { @@ -58,58 +53,87 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension return dm.Serialize() } -// estimateHistMinMax returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets. 
-func estimateHistMinMax(dp pmetric.HistogramDataPoint) (float64, float64) { +// histDataPointToSummary returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets. +// It MAY NOT be called with a data point with dp.Count() == 0. +func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, float64) { bounds := dp.MExplicitBounds() counts := dp.MBucketCounts() - // shortcut in the case both max and min are provided - if dp.HasMin() && dp.HasMax() { - return dp.Min(), dp.Max() + // shortcut if min, max, and sum are provided + if dp.HasMin() && dp.HasMax() && dp.HasSum() { + return dp.Min(), dp.Max(), dp.Sum() } - // Because we do not know the actual min and max, we estimate them based on the min and max non-empty bucket - minIdx, maxIdx := -1, -1 - for y := 0; y < len(counts); y++ { - if counts[y] > 0 { - if minIdx == -1 { - minIdx = y - } - maxIdx = y - } + // a single-bucket histogram is a special case + if len(counts) == 1 { + return estimateSingleBucketHistogram(dp) } - if minIdx == -1 || maxIdx == -1 { - return 0, 0 - } + // If any of min, max, sum is not provided in the data point, + // loop through the buckets to estimate them. + // All three values are estimated in order to avoid looping multiple times + // or complicating the loop with branches. After the loop, estimates + // will be overridden with any values provided by the data point. 
+ foundNonEmptyBucket := false + var min, max, sum float64 = 0, 0, 0 + + // Because we do not know the actual min, max, or sum, we estimate them based on non-empty buckets + for i := 0; i < len(counts); i++ { + // empty bucket + if counts[i] == 0 { + continue + } - var min, max float64 + // range for bucket counts[i] is bounds[i-1] to bounds[i] - if dp.HasMin() { - min = dp.Min() - } else { - // Use lower bound for min unless it is the first bucket which has no lower bound, then use upper - if minIdx == 0 { - min = bounds[minIdx] + // min estimation + if !foundNonEmptyBucket { + foundNonEmptyBucket = true + if i == 0 { + // if we're in the first bucket, the best estimate we can make for min is the upper bound + min = bounds[i] + } else { + min = bounds[i-1] + } + } + + // max estimation + if i == len(counts)-1 { + // if we're in the last bucket, the best estimate we can make for max is the lower bound + max = bounds[i-1] } else { - min = bounds[minIdx-1] + max = bounds[i] + } + + // sum estimation + switch i { + case 0: + // in the first bucket, estimate sum using the upper bound + sum += float64(counts[i]) * bounds[i] + case len(counts) - 1: + // in the last bucket, estimate sum using the lower bound + sum += float64(counts[i]) * bounds[i-1] + default: + // in any other bucket, estimate sum using the bucket midpoint + sum += float64(counts[i]) * (bounds[i] + bounds[i-1]) / 2 } } + // Override estimates with any values provided by the data point + if dp.HasMin() { + min = dp.Min() + } if dp.HasMax() { max = dp.Max() - } else { - // Use upper bound for max unless it is the last bucket which has no upper bound, then use lower - if maxIdx == len(counts)-1 { - max = bounds[maxIdx-1] - } else { - max = bounds[maxIdx] - } + } + if dp.HasSum() { + sum = dp.Sum() } // Set min to average when higher than average. This can happen when most values are lower than first boundary (falling in first bucket). // Set max to average when lower than average. 
This can happen when most values are higher than last boundary (falling in last bucket). - avg := dp.Sum() / float64(dp.Count()) + // dp.Count() will never be zero + avg := sum / float64(dp.Count()) if min > avg { min = avg } @@ -117,5 +141,29 @@ func estimateHistMinMax(dp pmetric.HistogramDataPoint) (float64, float64) { max = avg } - return min, max + return min, max, sum +} + +func estimateSingleBucketHistogram(dp pmetric.HistogramDataPoint) (float64, float64, float64) { + min, max, sum := 0.0, 0.0, 0.0 + + if dp.HasSum() { + sum = dp.Sum() + } + + mean := sum / float64(dp.Count()) + + if dp.HasMin() { + min = dp.Min() + } else { + min = mean + } + + if dp.HasMax() { + max = dp.Max() + } else { + max = mean + } + + return min, max, sum } diff --git a/exporter/dynatraceexporter/internal/serialization/histogram_test.go b/exporter/dynatraceexporter/internal/serialization/histogram_test.go index 562e5e71f621..d076af2f6695 100644 --- a/exporter/dynatraceexporter/internal/serialization/histogram_test.go +++ b/exporter/dynatraceexporter/internal/serialization/histogram_test.go @@ -131,4 +131,197 @@ func Test_serializeHistogram(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "prefix.min_max_hist gauge,min=3,max=7,sum=10,count=2 1626438600000", got) }) + + t.Run("when min is not provided it should be estimated", func(t *testing.T) { + t.Run("values between first two boundaries", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0}) + hist.SetCount(6) + hist.SetSum(21.2) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 1.0, min, "use bucket min") + }) + + t.Run("first bucket has value", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4}) + hist.SetCount(8) + hist.SetSum(34.5) + + min, _, _ := histDataPointToSummary(hist) + + 
assert.Equal(t, 1.0, min, "use the first boundary as estimation instead of Inf") + }) + + t.Run("only the first bucket has values, use the mean", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{3, 0, 0, 0, 0, 0}) + hist.SetCount(3) + hist.SetSum(0.75) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 0.25, min) + }) + t.Run("just one bucket from -Inf to Inf", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{}) + hist.SetMBucketCounts([]uint64{4}) + hist.SetCount(4) + hist.SetSum(8.8) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 2.2, min, "calculate the mean as min value") + }) + t.Run("just one bucket from -Inf to Inf", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{}) + hist.SetMBucketCounts([]uint64{1}) + hist.SetCount(1) + hist.SetSum(1.2) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 1.2, min, "calculate the mean as min value") + }) + t.Run("only the last bucket has a value", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 3}) + hist.SetCount(3) + hist.SetSum(15.6) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 5.0, min, "use the lower bound") + }) + }) + + t.Run("when max is not provided it should be estimated", func(t *testing.T) { + t.Run("values between the last two boundaries", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0}) + hist.SetSum(21.2) + hist.SetCount(6) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 5.0, max, "use bucket max") + }) + + t.Run("last bucket has value", func(t *testing.T) { + hist := 
pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4}) + hist.SetSum(34.5) + hist.SetCount(8) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 5.0, max, "use the last boundary as estimation instead of Inf") + }) + + t.Run("only the last bucket has values", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 2}) + hist.SetSum(20.2) + hist.SetCount(2) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 10.1, max, "use the mean (10.1) Otherwise, the max would be estimated as 5, and max >= avg would be violated") + }) + + t.Run("just one bucket from -Inf to Inf", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{}) + hist.SetMBucketCounts([]uint64{4}) + hist.SetSum(8.8) + hist.SetCount(4) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 2.2, max, "calculate the mean as max value") + }) + + t.Run("just one bucket from -Inf to Inf", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{}) + hist.SetMBucketCounts([]uint64{1}) + hist.SetSum(1.2) + hist.SetCount(1) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 1.2, max, "calculate the mean as max value") + }) + + t.Run("max is larger than sum", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{0, 5}) + hist.SetMBucketCounts([]uint64{0, 2, 0}) + hist.SetSum(2.3) + hist.SetCount(2) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 5.0, max, "use the estimated boundary") + }) + }) + + t.Run("when sum is not provided it should be estimated", func(t *testing.T) { + t.Run("single bucket histogram", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{}) + 
+			hist.SetMBucketCounts([]uint64{13})
+			hist.SetCount(6)
+
+			_, _, sum := histDataPointToSummary(hist)
+
+			assert.Equal(t, 0.0, sum, "estimate zero (midpoint of [-Inf, Inf])")
+		})
+
+		t.Run("data in bounded buckets", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 0})
+			hist.SetCount(6)
+
+			_, _, sum := histDataPointToSummary(hist)
+
+			assert.Equal(t, 3*1.5+5*2.5, sum, "estimate sum using bucket midpoints")
+		})
+
+		t.Run("data in unbounded buckets", func(t *testing.T) {
+			t.Run("first bucket", func(t *testing.T) {
+				hist := pmetric.NewHistogramDataPoint()
+				hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+				hist.SetMBucketCounts([]uint64{2, 3, 5, 0, 0, 0})
+				hist.SetCount(6)
+
+				_, _, sum := histDataPointToSummary(hist)
+
+				assert.Equal(t, 1*2+3*1.5+5*2.5, sum, "use bucket upper bound")
+			})
+
+			t.Run("last bucket", func(t *testing.T) {
+				hist := pmetric.NewHistogramDataPoint()
+				hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+				hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 2})
+				hist.SetCount(6)
+
+				_, _, sum := histDataPointToSummary(hist)
+
+				assert.Equal(t, 3*1.5+5*2.5+2*5, sum, "use bucket lower bound")
+			})
+		})
+	})
 }