Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(dynatraceexporter): provide better estimated summaries for partial histograms #11044

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
- `examples`: Add an example for scraping Couchbase metrics (#10894)
- `filestorageextension`: Add background compaction capability (#9327)
- `googlecloudpubsubreceiver`: Added new `Endpoint` and `Insecure` connection configuration options. (#10845)
- `dynatraceexporter`: Provide better estimated summaries for partial histograms. (#11044)

### 🧰 Bug fixes 🧰

Expand Down
120 changes: 79 additions & 41 deletions exporter/dynatraceexporter/internal/serialization/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,14 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
return "", nil
}

min, max := estimateHistMinMax(dp)
min, max, sum := histDataPointToSummary(dp)

dm, err := dtMetric.NewMetric(
name,
dtMetric.WithPrefix(prefix),
dtMetric.WithDimensions(dims),
dtMetric.WithTimestamp(dp.Timestamp().AsTime()),
dtMetric.WithFloatSummaryValue(
min,
max,
dp.Sum(),
int64(dp.Count()),
),
dtMetric.WithFloatSummaryValue(min, max, sum, int64(dp.Count())),
)

if err != nil {
Expand All @@ -58,64 +53,107 @@ func serializeHistogram(name, prefix string, dims dimensions.NormalizedDimension
return dm.Serialize()
}

// estimateHistMinMax returns the estimated minimum and maximum value in the histogram by using the min and max non-empty buckets.
func estimateHistMinMax(dp pmetric.HistogramDataPoint) (float64, float64) {
// histDataPointToSummary returns the minimum, maximum, and sum for the histogram data point, using the reported values where present and estimating any missing ones from the non-empty buckets.
func histDataPointToSummary(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
bounds := dp.MExplicitBounds()
counts := dp.MBucketCounts()

// shortcut in the case both max and min are provided
if dp.HasMin() && dp.HasMax() {
return dp.Min(), dp.Max()
// shortcut if min, max, and sum are provided
if dp.HasMin() && dp.HasMax() && dp.HasSum() {
return dp.Min(), dp.Max(), dp.Sum()
}

// Because we do not know the actual min and max, we estimate them based on the min and max non-empty bucket
minIdx, maxIdx := -1, -1
for y := 0; y < len(counts); y++ {
if counts[y] > 0 {
if minIdx == -1 {
minIdx = y
}
maxIdx = y
}
// a single-bucket histogram is a special case
if len(counts) == 1 {
return estimateSingleBucketHistogram(dp)
}

if minIdx == -1 || maxIdx == -1 {
return 0, 0
}
foundNonEmptyBucket := false
var min, max, sum float64 = 0, 0, 0

var min, max float64
// Because we do not know the actual min, max, or sum, we estimate them based on non-empty buckets
for i := 0; i < len(counts); i++ {
// empty bucket
if counts[i] == 0 {
continue
}

if dp.HasMin() {
min = dp.Min()
} else {
// Use lower bound for min unless it is the first bucket which has no lower bound, then use upper
if minIdx == 0 {
min = bounds[minIdx]
// range for counts[i] is bounds[i-1] to bounds[i]

// min estimation
if !foundNonEmptyBucket {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, I'm not sure I understand the purpose of this variable. Couldn't we just check whether `i == 0`? Then it would only enter this branch once. 🤔

Copy link
Member Author

@dyladan dyladan Jun 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first non-empty bucket might not be the first bucket.

foundNonEmptyBucket = true
if i == 0 {
// if we're in the first bucket, the best estimate we can make for min is the upper bound
min = bounds[i]
} else {
min = bounds[i-1]
}
}

if i == len(counts)-1 {
dyladan marked this conversation as resolved.
Show resolved Hide resolved
// if we're in the last bucket, the best estimate we can make for max is the lower bound
max = bounds[i-1]
} else {
min = bounds[minIdx-1]
max = bounds[i]
}

switch i {
case 0:
// in the first bucket, estimate sum using the upper bound
sum += float64(counts[i]) * bounds[i]
case len(counts) - 1:
// in the last bucket, estimate sum using the lower bound
sum += float64(counts[i]) * bounds[i-1]
default:
// in any other bucket, estimate sum using the bucket midpoint
sum += float64(counts[i]) * (bounds[i] + bounds[i-1]) / 2
}
}

if dp.HasMin() {
dyladan marked this conversation as resolved.
Show resolved Hide resolved
min = dp.Min()
}
if dp.HasMax() {
max = dp.Max()
} else {
// Use upper bound for max unless it is the last bucket which has no upper bound, then use lower
if maxIdx == len(counts)-1 {
max = bounds[maxIdx-1]
} else {
max = bounds[maxIdx]
}
}
if dp.HasSum() {
sum = dp.Sum()
}

// Set min to average when higher than average. This can happen when most values are lower than first boundary (falling in first bucket).
// Set max to average when lower than average. This can happen when most values are higher than last boundary (falling in last bucket).
avg := dp.Sum() / float64(dp.Count())
avg := sum / float64(dp.Count())
if min > avg {
min = avg
}
if max < avg {
max = avg
}

return min, max
return min, max, sum
}

func estimateSingleBucketHistogram(dp pmetric.HistogramDataPoint) (float64, float64, float64) {
min, max, sum := 0.0, 0.0, 0.0

if dp.HasSum() {
sum = dp.Sum()
}

mean := sum / float64(dp.Count())
dyladan marked this conversation as resolved.
Show resolved Hide resolved
dyladan marked this conversation as resolved.
Show resolved Hide resolved

if dp.HasMin() {
min = dp.Min()
} else {
min = mean
}

if dp.HasMax() {
max = dp.Max()
} else {
max = mean
}

return min, max, sum
}
145 changes: 145 additions & 0 deletions exporter/dynatraceexporter/internal/serialization/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,4 +131,149 @@ func Test_serializeHistogram(t *testing.T) {
assert.NoError(t, err)
assert.Equal(t, "prefix.min_max_hist gauge,min=3,max=7,sum=10,count=2 1626438600000", got)
})

dyladan marked this conversation as resolved.
Show resolved Hide resolved
t.Run("when min is not provided it should be estimated", func(t *testing.T) {
t.Run("values between first two boundaries", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0})
hist.SetCount(6)
hist.SetSum(21.2)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 1.0, min, "use bucket min")
})

t.Run("first bucket has value", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4})
hist.SetCount(8)
hist.SetSum(34.5)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 1.0, min, "use the first boundary as estimation instead of Inf")
})

t.Run("only the first bucket has values, use the mean", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{3, 0, 0, 0, 0, 0})
hist.SetCount(3)
hist.SetSum(0.75)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 0.25, min)
})
t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{})
hist.SetMBucketCounts([]uint64{4})
hist.SetCount(4)
hist.SetSum(8.8)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 2.2, min, "calculate the mean as min value")
})
t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{})
hist.SetMBucketCounts([]uint64{1})
hist.SetCount(1)
hist.SetSum(1.2)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 1.2, min, "calculate the mean as min value")
})
t.Run("only the last bucket has a value", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 3})
hist.SetCount(3)
hist.SetSum(15.6)

min, _, _ := histDataPointToSummary(hist)

assert.Equal(t, 5.0, min, "use the lower bound")
})
})

t.Run("when max is not provided it should be estimated", func(t *testing.T) {
t.Run("values between the last two boundaries", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{0, 1, 0, 3, 2, 0})
hist.SetSum(21.2)
hist.SetCount(6)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 5.0, max, "use bucket max")
})

t.Run("last bucket has value", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{1, 0, 0, 3, 0, 4})
hist.SetSum(34.5)
hist.SetCount(8)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 5.0, max, "use the last boundary as estimation instead of Inf")
})

t.Run("only the last bucket has values", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{1, 2, 3, 4, 5})
hist.SetMBucketCounts([]uint64{0, 0, 0, 0, 0, 2})
hist.SetSum(20.2)
hist.SetCount(2)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 10.1, max, "use the mean (10.1) Otherwise, the max would be estimated as 5, and max >= avg would be violated")
})

t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{})
hist.SetMBucketCounts([]uint64{4})
hist.SetSum(8.8)
hist.SetCount(4)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 2.2, max, "calculate the mean as max value")
})

t.Run("just one bucket from -Inf to Inf", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{})
hist.SetMBucketCounts([]uint64{1})
hist.SetSum(1.2)
hist.SetCount(1)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 1.2, max, "calculate the mean as max value")
})

t.Run("max is larger than sum", func(t *testing.T) {
hist := pmetric.NewHistogramDataPoint()
hist.SetMExplicitBounds([]float64{0, 5})
hist.SetMBucketCounts([]uint64{0, 2, 0})
hist.SetSum(2.3)
hist.SetCount(2)

_, max, _ := histDataPointToSummary(hist)

assert.Equal(t, 5.0, max, "use the estimated boundary")
})
})
}