Skip to content

Commit

Permalink
refactor(bigtable): Refactoring client side metrics code (#10623)
Browse files Browse the repository at this point in the history
  • Loading branch information
bhshkh authored Aug 1, 2024
1 parent 47941b3 commit 0e5c5c9
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 21 deletions.
29 changes: 12 additions & 17 deletions bigtable/bigtable.go
Original file line number Diff line number Diff line change
Expand Up @@ -1531,28 +1531,25 @@ func (t *Table) newBuiltinMetricsTracer(ctx context.Context, isStreaming bool) *
}

// recordOperationCompletion records as many operation-specific metrics as it can.
// Errors seen while creating metric attributes are ignored, since a metric can
// still be recorded with the rest of the attributes.
// It is a no-op when built-in metrics are disabled on the tracer.
func recordOperationCompletion(mt *builtinMetricsTracer) {
	if !mt.builtInEnabled {
		return
	}

	// Time elapsed since the operation started, in milliseconds.
	elapsedTimeMs := convertToMs(time.Since(mt.currOp.startTime))

	// Record operation_latencies.
	opLatAttrs, _ := mt.toOtelMetricAttrs(metricNameOperationLatencies)
	mt.instrumentOperationLatencies.Record(mt.ctx, elapsedTimeMs, metric.WithAttributes(opLatAttrs...))

	// Record retry_count.
	retryCntAttrs, _ := mt.toOtelMetricAttrs(metricNameRetryCount)
	if mt.currOp.attemptCount > 1 {
		// Only record when retry count is greater than 0 so the retry
		// graph will be less confusing.
		mt.instrumentRetryCount.Add(mt.ctx, mt.currOp.attemptCount-1, metric.WithAttributes(retryCntAttrs...))
	}
}
Expand Down Expand Up @@ -1604,23 +1601,21 @@ func gaxInvokeWithRecorder(ctx context.Context, mt *builtinMetricsTracer, method
}

// recordAttemptCompletion records as many attempt-specific metrics as it can.
// Errors seen while creating metric attributes are ignored, since a metric can
// still be recorded with the rest of the attributes.
func recordAttemptCompletion(mt *builtinMetricsTracer) {
if !mt.builtInEnabled {
return
}

// Calculate elapsed time
elapsedTime := float64(time.Since(mt.currOp.currAttempt.startTime).Nanoseconds()) / 1000000
elapsedTime := convertToMs(time.Since(mt.currOp.currAttempt.startTime))

// Attributes for attempt_latencies
// Ignore error seen while creating metric attributes since metric can still
// be recorded with rest of the attributes
// Record attempt_latencies
attemptLatAttrs, _ := mt.toOtelMetricAttrs(metricNameAttemptLatencies)
mt.instrumentAttemptLatencies.Record(mt.ctx, elapsedTime, metric.WithAttributes(attemptLatAttrs...))

// Attributes for server_latencies
// Ignore error seen while creating metric attributes since metric can still
// be recorded with rest of the attributes
// Record server_latencies
serverLatAttrs, _ := mt.toOtelMetricAttrs(metricNameServerLatencies)
if mt.currOp.currAttempt.serverLatencyErr == nil {
mt.instrumentServerLatencies.Record(mt.ctx, mt.currOp.currAttempt.serverLatency, metric.WithAttributes(serverLatAttrs...))
Expand Down
5 changes: 5 additions & 0 deletions bigtable/metric_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"strconv"
"strings"
"time"

btpb "cloud.google.com/go/bigtable/apiv2/bigtablepb"
"google.golang.org/grpc/metadata"
Expand Down Expand Up @@ -92,3 +93,7 @@ func extractLocation(headerMD metadata.MD, trailerMD metadata.MD) (string, strin

return responseParams.GetClusterId(), responseParams.GetZoneId(), nil
}

// convertToMs returns the duration d expressed in milliseconds, keeping any
// sub-millisecond remainder as the fractional part of the result.
func convertToMs(d time.Duration) float64 {
	const nanosPerMilli = 1000000
	ns := d.Nanoseconds()
	return float64(ns) / nanosPerMilli
}
15 changes: 11 additions & 4 deletions bigtable/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ const (
metricNameAttemptLatencies = "attempt_latencies"
metricNameServerLatencies = "server_latencies"
metricNameRetryCount = "retry_count"

// Metric units
metricUnitMS = "ms"
metricUnitCount = "1"
)

// These are effectively const, but for testing purposes they are mutable
Expand Down Expand Up @@ -207,7 +211,7 @@ func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) err
tf.operationLatencies, err = meter.Float64Histogram(
metricNameOperationLatencies,
metric.WithDescription("Total time until final operation success or failure, including retries and backoff."),
metric.WithUnit("ms"),
metric.WithUnit(metricUnitMS),
metric.WithExplicitBucketBoundaries(bucketBounds...),
)
if err != nil {
Expand All @@ -218,7 +222,7 @@ func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) err
tf.attemptLatencies, err = meter.Float64Histogram(
metricNameAttemptLatencies,
metric.WithDescription("Client observed latency per RPC attempt."),
metric.WithUnit("ms"),
metric.WithUnit(metricUnitMS),
metric.WithExplicitBucketBoundaries(bucketBounds...),
)
if err != nil {
Expand All @@ -229,7 +233,7 @@ func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) err
tf.serverLatencies, err = meter.Float64Histogram(
metricNameServerLatencies,
metric.WithDescription("The latency measured from the moment that the RPC entered the Google data center until the RPC was completed."),
metric.WithUnit("ms"),
metric.WithUnit(metricUnitMS),
metric.WithExplicitBucketBoundaries(bucketBounds...),
)
if err != nil {
Expand All @@ -240,6 +244,7 @@ func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) err
tf.retryCount, err = meter.Int64Counter(
metricNameRetryCount,
metric.WithDescription("The number of additional RPCs sent after the initial attempt."),
metric.WithUnit(metricUnitCount),
)
return err
}
Expand Down Expand Up @@ -268,6 +273,8 @@ type builtinMetricsTracer struct {
}

// opTracer is used to record metrics for the entire operation, including retries.
// An operation is a logical unit that represents a single method invocation on the client.
// The method might require multiple attempts/RPCs and backoff logic to complete.
type opTracer struct {
attemptCount int64

Expand All @@ -281,7 +288,6 @@ type opTracer struct {

// setStartTime stores t as the start time tracked by this opTracer.
func (o *opTracer) setStartTime(t time.Time) {
	o.startTime = t
}

func (o *opTracer) setStatus(status string) {
Expand All @@ -293,6 +299,7 @@ func (o *opTracer) incrementAttemptCount() {
}

// attemptTracer is used to record metrics for each individual attempt of the operation.
// Attempt corresponds to an attempt of an RPC.
type attemptTracer struct {
startTime time.Time
clusterID string
Expand Down

0 comments on commit 0e5c5c9

Please sign in to comment.