From 3735ddaa7793461d3568b3b36e9737ba2b9de7b6 Mon Sep 17 00:00:00 2001
From: Daniel Dyla <dyladan@users.noreply.github.com>
Date: Tue, 14 Jun 2022 18:31:43 -0400
Subject: [PATCH 1/6] feat(dynatraceexporter): provide better estimated
 summaries for partial histograms

---
 .../internal/serialization/histogram.go       | 119 +++++++++-----
 .../internal/serialization/histogram_test.go  | 145 ++++++++++++++++++
 2 files changed, 223 insertions(+), 41 deletions(-)

diff --git a/exporter/dynatraceexporter/internal/serialization/histogram.go b/exporter/dynatraceexporter/internal/serialization/histogram.go
index f543e6d43b7b..d63de0da4fdd 100644
--- a/exporter/dynatraceexporter/internal/serialization/histogram.go
+++ b/exporter/dynatraceexporter/internal/serialization/histogram.go
@@ -36,19 +36,14 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
 		return "", nil
 	}
 
-	min, max := estimateHistMinMax(dp)
+	min, max, sum := histDataPointToSummary(dp)
 
 	dm, err := dtMetric.NewMetric(
 		name,
 		dtMetric.WithPrefix(prefix),
 		dtMetric.WithDimensions(dims),
 		dtMetric.WithTimestamp(dp.Timestamp().AsTime()),
-		dtMetric.WithFloatSummaryValue(
-			min,
-			max,
-			dp.Sum(),
-			int64(dp.Count()),
-		),
+		dtMetric.WithFloatSummaryValue(min, max, sum, int64(dp.Count())),
 	)
 
 	if err != nil {
@@ -58,58 +53,76 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
 	return dm.Serialize()
 }
 
-// estimateHistMinMax returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets.
-func estimateHistMinMax(dp pmetric.HistogramDataPoint) (float64, float64) {
+// histDataPointToSummary returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets.
+func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
 	bounds := dp.MExplicitBounds()
 	counts := dp.MBucketCounts()
 
-	// shortcut in the case both max and min are provided
-	if dp.HasMin() && dp.HasMax() {
-		return dp.Min(), dp.Max()
+	// shortcut if min, max, and sum are provided
+	if dp.HasMin() && dp.HasMax() && dp.HasSum() {
+		return dp.Min(), dp.Max(), dp.Sum()
 	}
 
-	// Because we do not know the actual min and max, we estimate them based on the min and max non-empty bucket
-	minIdx, maxIdx := -1, -1
-	for y := 0; y < len(counts); y++ {
-		if counts[y] > 0 {
-			if minIdx == -1 {
-				minIdx = y
-			}
-			maxIdx = y
-		}
+	// a single-bucket histogram is a special case
+	if len(counts) == 1 {
+		return estimateSingleBucketHistogram(dp)
 	}
 
-	if minIdx == -1 || maxIdx == -1 {
-		return 0, 0
-	}
+	foundNonEmptyBucket := false
+	var min, max, sum float64 = 0, 0, 0
 
-	var min, max float64
+	// Because we do not know the actual min, max, or sum, we estimate them based on non-empty buckets
+	for i := 0; i < len(counts); i++ {
+		// empty bucket
+		if counts[i] == 0 {
+			continue
+		}
 
-	if dp.HasMin() {
-		min = dp.Min()
-	} else {
-		// Use lower bound for min unless it is the first bucket which has no lower bound, then use upper
-		if minIdx == 0 {
-			min = bounds[minIdx]
+		// range for counts[i] is bounds[i-1] to bounds[i]
+
+		// min estimation
+		if !foundNonEmptyBucket {
+			foundNonEmptyBucket = true
+			if i == 0 {
+				// if we're in the first bucket, the best estimate we can make for min is the upper bound
+				min = bounds[i]
+			} else {
+				min = bounds[i-1]
+			}
+		}
+
+		if i == len(counts)-1 {
+			// if we're in the last bucket, the best estimate we can make for max is the lower bound
+			max = bounds[i-1]
 		} else {
-			min = bounds[minIdx-1]
+			max = bounds[i]
+		}
+
+		if i == 0 {
+			// in the first bucket, estimate sum using the upper bound
+			sum += float64(counts[i]) * bounds[i]
+		} else if i == len(counts)-1 {
+			// in the last bucket, estimate sum using the lower bound
+			sum += float64(counts[i]) * bounds[i-1]
+		} else {
+			// in any other bucket, estimate sum using the bucket midpoint
+			sum += float64(counts[i]) * (bounds[i] + bounds[i-1]) / 2
 		}
 	}
 
+	if dp.HasMin() {
+		min = dp.Min()
+	}
 	if dp.HasMax() {
 		max = dp.Max()
-	} else {
-		// Use upper bound for max unless it is the last bucket which has no upper bound, then use lower
-		if maxIdx == len(counts)-1 {
-			max = bounds[maxIdx-1]
-		} else {
-			max = bounds[maxIdx]
-		}
+	}
+	if dp.HasSum() {
+		sum = dp.Sum()
 	}
 
 	// Set min to average when higher than average. This can happen when most values are lower than first boundary (falling in first bucket).
 	// Set max to average when lower than average. This can happen when most values are higher than last boundary (falling in last bucket).
-	avg := dp.Sum() / float64(dp.Count())
+	avg := sum / float64(dp.Count())
 	if min > avg {
 		min = avg
 	}
@@ -117,5 +130,29 @@ func estimateHistMinMax(dp pmetric.HistogramDataPoint) (float64, float64) {
 		max = avg
 	}
 
-	return min, max
+	return min, max, sum
+}
+
+func estimateSingleBucketHistogram(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
+	min, max, sum := 0.0, 0.0, 0.0
+
+	if dp.HasSum() {
+		sum = dp.Sum()
+	}
+
+	mean := sum / float64(dp.Count())
+
+	if dp.HasMin() {
+		min = dp.Min()
+	} else {
+		min = mean
+	}
+
+	if dp.HasMax() {
+		max = dp.Max()
+	} else {
+		max = mean
+	}
+
+	return min, max, sum
 }
diff --git a/exporter/dynatraceexporter/internal/serialization/histogram_test.go b/exporter/dynatraceexporter/internal/serialization/histogram_test.go
index 562e5e71f621..2f44c1ccad6e 100644
--- a/exporter/dynatraceexporter/internal/serialization/histogram_test.go
+++ b/exporter/dynatraceexporter/internal/serialization/histogram_test.go
@@ -131,4 +131,149 @@ func Test_serializeHistogram(t *testing.T) {
 		assert.NoError(t, err)
 		assert.Equal(t, "prefix.min_max_hist gauge,min=3,max=7,sum=10,count=2 1626438600000", got)
 	})
+
+	t.Run("when min is not provided it should be estimated", func(t *testing.T) {
+		t.Run("values between first two boundaries", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0})
+			hist.SetCount(6)
+			hist.SetSum(21.2)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 1.0, min, "use bucket min")
+		})
+
+		t.Run("first bucket has value", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4})
+			hist.SetCount(8)
+			hist.SetSum(34.5)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 1.0, min, "use the first boundary as estimation instead of Inf")
+		})
+
+		t.Run("only the first bucket has values, use the mean", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{3, 0, 0, 0, 0, 0})
+			hist.SetCount(3)
+			hist.SetSum(0.75)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 0.25, min)
+		})
+		t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{})
+			hist.SetMBucketCounts([]uint64{4})
+			hist.SetCount(4)
+			hist.SetSum(8.8)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 2.2, min, "calculate the mean as min value")
+		})
+		t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{})
+			hist.SetMBucketCounts([]uint64{1})
+			hist.SetCount(1)
+			hist.SetSum(1.2)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 1.2, min, "calculate the mean as min value")
+		})
+		t.Run("only the last bucket has a value", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 3})
+			hist.SetCount(3)
+			hist.SetSum(15.6)
+
+			min, _, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 5.0, min, "use the lower bound")
+		})
+	})
+
+	t.Run("when max is not provided it should be estimated", func(t *testing.T) {
+		t.Run("values between the last two boundaries", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0})
+			hist.SetSum(21.2)
+			hist.SetCount(6)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 5.0, max, "use bucket max")
+		})
+
+		t.Run("last bucket has value", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4})
+			hist.SetSum(34.5)
+			hist.SetCount(8)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 5.0, max, "use the last boundary as estimation instead of Inf")
+		})
+
+		t.Run("only the last bucket has values", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 2})
+			hist.SetSum(20.2)
+			hist.SetCount(2)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 10.1, max, "use the mean (10.1) Otherwise, the max would be estimated as 5, and max >= avg would be violated")
+		})
+
+		t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{})
+			hist.SetMBucketCounts([]uint64{4})
+			hist.SetSum(8.8)
+			hist.SetCount(4)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 2.2, max, "calculate the mean as max value")
+		})
+
+		t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{})
+			hist.SetMBucketCounts([]uint64{1})
+			hist.SetSum(1.2)
+			hist.SetCount(1)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 1.2, max, "calculate the mean as max value")
+		})
+
+		t.Run("max is larger than sum", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{0, 5})
+			hist.SetMBucketCounts([]uint64{0, 2, 0})
+			hist.SetSum(2.3)
+			hist.SetCount(2)
+
+			_, max, _ := histDataPointToSummary(hist)
+
+			assert.Equal(t, 5.0, max, "use the estimated boundary")
+		})
+	})
 }

From bdd131be190934beedca66329cfd0b5a562df699 Mon Sep 17 00:00:00 2001
From: Daniel Dyla <dyladan@users.noreply.github.com>
Date: Tue, 14 Jun 2022 18:36:00 -0400
Subject: [PATCH 2/6] Update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 298d2decb9cd..948aa182f166 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,6 +27,7 @@
 - `examples`: Add an example for scraping Couchbase metrics (#10894)
 - `filestorageextension`: Add background compaction capability (#9327)
 - `googlecloudpubsubreceiver`: Added new `Endpoint` and `Insecure` connection configuration options. (#10845)
+- `dynatraceexporter`: Provide better estimated summaries for partial histograms. (#11044)
 
 ### 🧰 Bug fixes 🧰
 

From 7305c8d3fad7699df011e33f4f69feb7cee472d0 Mon Sep 17 00:00:00 2001
From: Daniel Dyla <dyladan@users.noreply.github.com>
Date: Wed, 15 Jun 2022 08:12:34 -0400
Subject: [PATCH 3/6] Lint

---
 .../dynatraceexporter/internal/serialization/histogram.go  | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/exporter/dynatraceexporter/internal/serialization/histogram.go b/exporter/dynatraceexporter/internal/serialization/histogram.go
index d63de0da4fdd..1d1509631504 100644
--- a/exporter/dynatraceexporter/internal/serialization/histogram.go
+++ b/exporter/dynatraceexporter/internal/serialization/histogram.go
@@ -98,13 +98,14 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl
 			max = bounds[i]
 		}
 
-		if i == 0 {
+		switch i {
+		case 0:
 			// in the first bucket, estimate sum using the upper bound
 			sum += float64(counts[i]) * bounds[i]
-		} else if i == len(counts)-1 {
+		case len(counts) - 1:
 			// in the last bucket, estimate sum using the lower bound
 			sum += float64(counts[i]) * bounds[i-1]
-		} else {
+		default:
 			// in any other bucket, estimate sum using the bucket midpoint
 			sum += float64(counts[i]) * (bounds[i] + bounds[i-1]) / 2
 		}

From 6b377d14a3b3f1a480b1c7840cd22d7d8b6d207e Mon Sep 17 00:00:00 2001
From: Daniel Dyla <dyladan@users.noreply.github.com>
Date: Thu, 16 Jun 2022 10:53:38 -0400
Subject: [PATCH 4/6] dynatraceexporter: test sum estimation

---
 .../internal/serialization/histogram_test.go  | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/exporter/dynatraceexporter/internal/serialization/histogram_test.go b/exporter/dynatraceexporter/internal/serialization/histogram_test.go
index 2f44c1ccad6e..d076af2f6695 100644
--- a/exporter/dynatraceexporter/internal/serialization/histogram_test.go
+++ b/exporter/dynatraceexporter/internal/serialization/histogram_test.go
@@ -276,4 +276,52 @@ func Test_serializeHistogram(t *testing.T) {
 			assert.Equal(t, 5.0, max, "use the estimated boundary")
 		})
 	})
+
+	t.Run("when sum is not provided it should be estimated", func(t *testing.T) {
+		t.Run("single bucket histogram", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{})
+			hist.SetMBucketCounts([]uint64{13})
+			hist.SetCount(6)
+
+			_, _, sum := histDataPointToSummary(hist)
+
+			assert.Equal(t, 0.0, sum, "estimate zero (midpoint of [-Inf, Inf])")
+		})
+
+		t.Run("data in bounded buckets", func(t *testing.T) {
+			hist := pmetric.NewHistogramDataPoint()
+			hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+			hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 0})
+			hist.SetCount(6)
+
+			_, _, sum := histDataPointToSummary(hist)
+
+			assert.Equal(t, 3*1.5+5*2.5, sum, "estimate sum using bucket midpoints")
+		})
+
+		t.Run("data in unbounded buckets", func(t *testing.T) {
+			t.Run("first bucket", func(t *testing.T) {
+				hist := pmetric.NewHistogramDataPoint()
+				hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+				hist.SetMBucketCounts([]uint64{2, 3, 5, 0, 0, 0})
+				hist.SetCount(6)
+
+				_, _, sum := histDataPointToSummary(hist)
+
+				assert.Equal(t, 1*2+3*1.5+5*2.5, sum, "use bucket upper bound")
+			})
+
+			t.Run("last bucket", func(t *testing.T) {
+				hist := pmetric.NewHistogramDataPoint()
+				hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
+				hist.SetMBucketCounts([]uint64{0, 3, 5, 0, 0, 2})
+				hist.SetCount(6)
+
+				_, _, sum := histDataPointToSummary(hist)
+
+				assert.Equal(t, 3*1.5+5*2.5+2*5, sum, "use bucket upper bound")
+			})
+		})
+	})
 }

From 5b77183e5c0c38c06bce22105cf99bf2ea322fc8 Mon Sep 17 00:00:00 2001
From: Daniel Dyla <dyladan@users.noreply.github.com>
Date: Thu, 16 Jun 2022 10:58:55 -0400
Subject: [PATCH 5/6] Clarification comments

---
 .../internal/serialization/histogram.go                | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/exporter/dynatraceexporter/internal/serialization/histogram.go b/exporter/dynatraceexporter/internal/serialization/histogram.go
index 1d1509631504..d73e91881a3e 100644
--- a/exporter/dynatraceexporter/internal/serialization/histogram.go
+++ b/exporter/dynatraceexporter/internal/serialization/histogram.go
@@ -68,6 +68,11 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl
 		return estimateSingleBucketHistogram(dp)
 	}
 
+	// If any of min, max, sum is not provided in the data point,
+	// loop through the buckets to estimate them.
+	// All three values are estimated in order to avoid looping multiple times
+	// or complicating the loop with branches. After the loop, estimates
+	// will be overridden with any values provided by the data point.
 	foundNonEmptyBucket := false
 	var min, max, sum float64 = 0, 0, 0
 
@@ -78,7 +83,7 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl
 			continue
 		}
 
-		// range for counts[i] is bounds[i-1] to bounds[i]
+		// range for bucket counts[i] is bounds[i-1] to bounds[i]
 
 		// min estimation
 		if !foundNonEmptyBucket {
@@ -91,6 +96,7 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl
 			}
 		}
 
+		// max estimation
 		if i == len(counts)-1 {
 			// if we're in the last bucket, the best estimate we can make for max is the lower bound
 			max = bounds[i-1]
@@ -98,6 +104,7 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl
 			max = bounds[i]
 		}
 
+		// sum estimation
 		switch i {
 		case 0:
 			// in the first bucket, estimate sum using the upper bound
@@ -111,6 +118,7 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl
 		}
 	}
 
+	// Override estimates with any values provided by the data point
 	if dp.HasMin() {
 		min = dp.Min()
 	}

From f66a42129f5c77fae8515efedc3a97e6173185ba Mon Sep 17 00:00:00 2001
From: Daniel Dyla <dyladan@users.noreply.github.com>
Date: Thu, 16 Jun 2022 11:01:19 -0400
Subject: [PATCH 6/6] dynatraceexporter: Clarify count may not be zero for
 histogram estimation

---
 exporter/dynatraceexporter/internal/serialization/histogram.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/exporter/dynatraceexporter/internal/serialization/histogram.go b/exporter/dynatraceexporter/internal/serialization/histogram.go
index d73e91881a3e..a3b8b4c699bc 100644
--- a/exporter/dynatraceexporter/internal/serialization/histogram.go
+++ b/exporter/dynatraceexporter/internal/serialization/histogram.go
@@ -54,6 +54,7 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
 }
 
 // histDataPointToSummary returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets.
+// It MAY NOT be called with a data point with dp.Count() == 0.
 func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
 	bounds := dp.MExplicitBounds()
 	counts := dp.MBucketCounts()
@@ -131,6 +132,7 @@ func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, fl
 
 	// Set min to average when higher than average. This can happen when most values are lower than first boundary (falling in first bucket).
 	// Set max to average when lower than average. This can happen when most values are higher than last boundary (falling in last bucket).
+	// dp.Count() will never be zero
 	avg := sum / float64(dp.Count())
 	if min > avg {
 		min = avg