From 3735ddaa7793461d3568b3b36e9737ba2b9de7b6 Mon Sep 17 00:00:00 2001 From: Daniel Dyla Date: Tue, 14 Jun 2022 18:31:43 -0400 Subject: [PATCH 1/6] feat(dynatraceexporter): provide better estimated summaries for partial histograms --- .../internal/serialization/histogram.go | 119 +++++++++----- .../internal/serialization/histogram_test.go | 145 ++++++++++++++++++ 2 files changed, 223 insertions(+), 41 deletions(-) diff --git a/exporter/dynatraceexporter/internal/serialization/histogram.go b/exporter/dynatraceexporter/internal/serialization/histogram.go index f543e6d43b7b..d63de0da4fdd 100644 --- a/exporter/dynatraceexporter/internal/serialization/histogram.go +++ b/exporter/dynatraceexporter/internal/serialization/histogram.go @@ -36,19 +36,14 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension return "", nil } - min, max := estimateHistMinMax(dp) + min, max, sum := histDataPointToSummary(dp) dm, err := dtMetric.NewMetric( name, dtMetric.WithPrefix(prefix), dtMetric.WithDimensions(dims), dtMetric.WithTimestamp(dp.Timestamp().AsTime()), - dtMetric.WithFloatSummaryValue( - min, - max, - dp.Sum(), - int64(dp.Count()), - ), + dtMetric.WithFloatSummaryValue(min, max, sum, int64(dp.Count())), ) if err != nil { @@ -58,58 +53,76 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension return dm.Serialize() } -// estimateHistMinMax returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets. -func estimateHistMinMax(dp pmetric.HistogramDataPoint) (float64, float64) { +// histDataPointToSummary returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets. +func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, float64) { bounds := dp.MExplicitBounds() counts := dp.MBucketCounts() - // shortcut in the case both max and min are provided - if dp.HasMin() && dp.HasMax() { - return dp.Min(), dp.Max() + // shortcut if min, max, and sum are provided + if dp.HasMin() && dp.HasMax() && dp.HasSum() { + return dp.Min(), dp.Max(), dp.Sum() } - // Because we do not know the actual min and max, we estimate them based on the min and max non-empty bucket - minIdx, maxIdx := -1, -1 - for y := 0; y < len(counts); y++ { - if counts[y] > 0 { - if minIdx == -1 { - minIdx = y - } - maxIdx = y - } + // a single-bucket histogram is a special case + if len(counts) == 1 { + return estimateSingleBucketHistogram(dp) } - if minIdx == -1 || maxIdx == -1 { - return 0, 0 - } + foundNonEmptyBucket := false + var min, max, sum float64 = 0, 0, 0 - var min, max float64 + // Because we do not know the actual min, max, or sum, we estimate them based on non-empty buckets + for i := 0; i < len(counts); i++ { + // empty bucket + if counts[i] == 0 { + continue + } - if dp.HasMin() { - min = dp.Min() - } else { - // Use lower bound for min unless it is the first bucket which has no lower bound, then use upper - if minIdx == 0 { - min = bounds[minIdx] + // range for counts[i] is bounds[i-1] to bounds[i] + + // min estimation + if !foundNonEmptyBucket { + foundNonEmptyBucket = true + if i == 0 { + // if we're in the first bucket, the best estimate we can make for min is the upper bound + min = bounds[i] + } else { + min = bounds[i-1] + } + } + + if i == len(counts)-1 { + // if we're in the last bucket, the best estimate we can make for max is the lower bound + max = bounds[i-1] } else { - min = bounds[minIdx-1] + max = bounds[i] + } + + if i == 0 { + // in the first bucket, estimate sum using the upper bound + sum += float64(counts[i]) * bounds[i] + } else if i == len(counts)-1 { + // in the last bucket, estimate sum using the lower bound + sum += float64(counts[i]) * bounds[i-1] + } else { + // in any other bucket, estimate sum using the bucket midpoint + sum += float64(counts[i]) * (bounds[i] + bounds[i-1]) / 2 } } + if dp.HasMin() { + min = dp.Min() + } if dp.HasMax() { max = dp.Max() - } else { - // Use upper bound for max unless it is the last bucket which has no upper bound, then use lower - if maxIdx == len(counts)-1 { - max = bounds[maxIdx-1] - } else { - max = bounds[maxIdx] - } + } + if dp.HasSum() { + sum = dp.Sum() } // Set min to average when higher than average. This can happen when most values are lower than first boundary (falling in first bucket). // Set max to average when lower than average. This can happen when most values are higher than last boundary (falling in last bucket). - avg := dp.Sum() / float64(dp.Count()) + avg := sum / float64(dp.Count()) if min > avg { min = avg } @@ -117,5 +130,29 @@ func estimateHistMinMax(dp pmetric.HistogramDataPoint) (float64, float64) { max = avg } - return min, max + return min, max, sum +} + +func estimateSingleBucketHistogram(dp pmetric.HistogramDataPoint) (float64, float64, float64) { + min, max, sum := 0.0, 0.0, 0.0 + + if dp.HasSum() { + sum = dp.Sum() + } + + mean := sum / float64(dp.Count()) + + if dp.HasMin() { + min = dp.Min() + } else { + min = mean + } + + if dp.HasMax() { + max = dp.Max() + } else { + max = mean + } + + return min, max, sum } diff --git a/exporter/dynatraceexporter/internal/serialization/histogram_test.go b/exporter/dynatraceexporter/internal/serialization/histogram_test.go index 562e5e71f621..2f44c1ccad6e 100644 --- a/exporter/dynatraceexporter/internal/serialization/histogram_test.go +++ b/exporter/dynatraceexporter/internal/serialization/histogram_test.go @@ -131,4 +131,149 @@ func Test_serializeHistogram(t *testing.T) { assert.NoError(t, err) assert.Equal(t, "prefix.min_max_hist gauge,min=3,max=7,sum=10,count=2 1626438600000", got) }) + + t.Run("when min is not provided it should be estimated", func(t *testing.T) { + t.Run("values between first two boundaries", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0}) + hist.SetCount(6) + hist.SetSum(21.2) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 1.0, min, "use bucket min") + }) + + t.Run("first bucket has value", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4}) + hist.SetCount(8) + hist.SetSum(34.5) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 1.0, min, "use the first boundary as estimation instead of Inf") + }) + + t.Run("only the first bucket has values, use the mean", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{3, 0, 0, 0, 0, 0}) + hist.SetCount(3) + hist.SetSum(0.75) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 0.25, min) + }) + t.Run("just one bucket from -Inf to Inf", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{}) + hist.SetMBucketCounts([]uint64{4}) + hist.SetCount(4) + hist.SetSum(8.8) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 2.2, min, "calculate the mean as min value") + }) + t.Run("just one bucket from -Inf to Inf", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{}) + hist.SetMBucketCounts([]uint64{1}) + hist.SetCount(1) + hist.SetSum(1.2) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 1.2, min, "calculate the mean as min value") + }) + t.Run("only the last bucket has a value", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 3}) + hist.SetCount(3) + hist.SetSum(15.6) + + min, _, _ := histDataPointToSummary(hist) + + assert.Equal(t, 5.0, min, "use the lower bound") + }) + }) + + t.Run("when max is not provided it should be estimated", func(t *testing.T) { + t.Run("values between the last two boundaries", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0}) + hist.SetSum(21.2) + hist.SetCount(6) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 5.0, max, "use bucket max") + }) + + t.Run("last bucket has value", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4}) + hist.SetSum(34.5) + hist.SetCount(8) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 5.0, max, "use the last boundary as estimation instead of Inf") + }) + + t.Run("only the last bucket has values", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 2}) + hist.SetSum(20.2) + hist.SetCount(2) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 10.1, max, "use the mean (10.1) Otherwise, the max would be estimated as 5, and max >= avg would be violated") + }) + + t.Run("just one bucket from -Inf to Inf", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{}) + hist.SetMBucketCounts([]uint64{4}) + hist.SetSum(8.8) + hist.SetCount(4) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 2.2, max, "calculate the mean as max value") + }) + + t.Run("just one bucket from -Inf to Inf", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{}) + hist.SetMBucketCounts([]uint64{1}) + hist.SetSum(1.2) + hist.SetCount(1) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 1.2, max, "calculate the mean as max value") + }) + + t.Run("max is larger than sum", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{0, 5}) + hist.SetMBucketCounts([]uint64{0, 2, 0}) + hist.SetSum(2.3) + hist.SetCount(2) + + _, max, _ := histDataPointToSummary(hist) + + assert.Equal(t, 5.0, max, "use the estimated boundary") + }) + }) } From bdd131be190934beedca66329cfd0b5a562df699 Mon Sep 17 00:00:00 2001 From: Daniel Dyla Date: Tue, 14 Jun 2022 18:36:00 -0400 Subject: [PATCH 2/6] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 298d2decb9cd..948aa182f166 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ - `examples`: Add an example for scraping Couchbase metrics (#10894) - `filestorageextension`: Add background compaction capability (#9327) - `googlecloudpubsubreceiver`: Added new `Endpoint` and `Insecure` connection configuration options. (#10845) +- `dynatraceexporter`: Provide better estimated summaries for partial histograms. (#11044) ### 🧰 Bug fixes 🧰 From 7305c8d3fad7699df011e33f4f69feb7cee472d0 Mon Sep 17 00:00:00 2001 From: Daniel Dyla Date: Wed, 15 Jun 2022 08:12:34 -0400 Subject: [PATCH 3/6] Lint --- .../dynatraceexporter/internal/serialization/histogram.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/exporter/dynatraceexporter/internal/serialization/histogram.go b/exporter/dynatraceexporter/internal/serialization/histogram.go index d63de0da4fdd..1d1509631504 100644 --- a/exporter/dynatraceexporter/internal/serialization/histogram.go +++ b/exporter/dynatraceexporter/internal/serialization/histogram.go @@ -98,13 +98,14 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl max = bounds[i] } - if i == 0 { + switch i { + case 0: // in the first bucket, estimate sum using the upper bound sum += float64(counts[i]) * bounds[i] - } else if i == len(counts)-1 { + case len(counts) - 1: // in the last bucket, estimate sum using the lower bound sum += float64(counts[i]) * bounds[i-1] - } else { + default: // in any other bucket, estimate sum using the bucket midpoint sum += float64(counts[i]) * (bounds[i] + bounds[i-1]) / 2 } From 6b377d14a3b3f1a480b1c7840cd22d7d8b6d207e Mon Sep 17 00:00:00 2001 From: Daniel Dyla Date: Thu, 16 Jun 2022 10:53:38 -0400 Subject: [PATCH 4/6] dynatraceexporter: test sum estimation --- .../internal/serialization/histogram_test.go | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/exporter/dynatraceexporter/internal/serialization/histogram_test.go b/exporter/dynatraceexporter/internal/serialization/histogram_test.go index 2f44c1ccad6e..d076af2f6695 100644 --- a/exporter/dynatraceexporter/internal/serialization/histogram_test.go +++ b/exporter/dynatraceexporter/internal/serialization/histogram_test.go @@ -276,4 +276,52 @@ func Test_serializeHistogram(t *testing.T) { assert.Equal(t, 5.0, max, "use the estimated boundary") }) }) + + t.Run("when sum is not provided it should be estimated", func(t *testing.T) { + t.Run("single bucket histogram", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{}) + hist.SetMBucketCounts([]uint64{13}) + hist.SetCount(6) + + _, _, sum := histDataPointToSummary(hist) + + assert.Equal(t, 0.0, sum, "estimate zero (midpoint of [-Inf, Inf])") + }) + + t.Run("data in bounded buckets", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 0}) + hist.SetCount(6) + + _, _, sum := histDataPointToSummary(hist) + + assert.Equal(t, 3*1.5+5*2.5, sum, "estimate sum using bucket midpoints") + }) + + t.Run("data in unbounded buckets", func(t *testing.T) { + t.Run("first bucket", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{2, 3, 5, 0, 0, 0}) + hist.SetCount(6) + + _, _, sum := histDataPointToSummary(hist) + + assert.Equal(t, 1*2+3*1.5+5*2.5, sum, "use bucket upper bound") + }) + + t.Run("last bucket", func(t *testing.T) { + hist := pmetric.NewHistogramDataPoint() + hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5}) + hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 2}) + hist.SetCount(6) + + _, _, sum := histDataPointToSummary(hist) + + assert.Equal(t, 3*1.5+5*2.5+2*5, sum, "use bucket upper bound") + }) + }) + }) } From 5b77183e5c0c38c06bce22105cf99bf2ea322fc8 Mon Sep 17 00:00:00 2001 From: Daniel Dyla Date: Thu, 16 Jun 2022 10:58:55 -0400 Subject: [PATCH 5/6] Clarification comments --- .../internal/serialization/histogram.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/exporter/dynatraceexporter/internal/serialization/histogram.go b/exporter/dynatraceexporter/internal/serialization/histogram.go index 1d1509631504..d73e91881a3e 100644 --- a/exporter/dynatraceexporter/internal/serialization/histogram.go +++ b/exporter/dynatraceexporter/internal/serialization/histogram.go @@ -68,6 +68,11 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl return estimateSingleBucketHistogram(dp) } + // If any of min, max, sum is not provided in the data point, + // loop through the buckets to estimate them. + // All three values are estimated in order to avoid looping multiple times + // or complicating the loop with branches. After the loop, estimates + // will be overridden with any values provided by the data point. foundNonEmptyBucket := false var min, max, sum float64 = 0, 0, 0 @@ -78,7 +83,7 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl continue } - // range for counts[i] is bounds[i-1] to bounds[i] + // range for bucket counts[i] is bounds[i-1] to bounds[i] // min estimation if !foundNonEmptyBucket { @@ -91,6 +96,7 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl } } + // max estimation if i == len(counts)-1 { // if we're in the last bucket, the best estimate we can make for max is the lower bound max = bounds[i-1] @@ -98,6 +104,7 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl max = bounds[i] } + // sum estimation switch i { case 0: // in the first bucket, estimate sum using the upper bound @@ -111,6 +118,7 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl } } + // Override estimates with any values provided by the data point if dp.HasMin() { min = dp.Min() } From f66a42129f5c77fae8515efedc3a97e6173185ba Mon Sep 17 00:00:00 2001 From: Daniel Dyla Date: Thu, 16 Jun 2022 11:01:19 -0400 Subject: [PATCH 6/6] dynatraceexporter: Clarify count may not be zero for histogram estimation --- exporter/dynatraceexporter/internal/serialization/histogram.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/exporter/dynatraceexporter/internal/serialization/histogram.go b/exporter/dynatraceexporter/internal/serialization/histogram.go index d73e91881a3e..a3b8b4c699bc 100644 --- a/exporter/dynatraceexporter/internal/serialization/histogram.go +++ b/exporter/dynatraceexporter/internal/serialization/histogram.go @@ -54,6 +54,7 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension } // histDataPointToSummary returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets. +// It MAY NOT be called with a data point with dp.Count() == 0. func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, float64) { bounds := dp.MExplicitBounds() counts := dp.MBucketCounts() @@ -131,6 +132,7 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl // Set min to average when higher than average. This can happen when most values are lower than first boundary (falling in first bucket). // Set max to average when lower than average. This can happen when most values are higher than last boundary (falling in last bucket). + // dp.Count() will never be zero avg := sum / float64(dp.Count()) if min > avg { min = avg