refactor(bigtable): Refactoring client side metrics code (#10623)

googleapis · Aug 1, 2024 · 0e5c5c9 · 0e5c5c9
1 parent 47941b3
commit 0e5c5c9
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 21 deletions.
diff --git a/bigtable/bigtable.go b/bigtable/bigtable.go
@@ -1531,28 +1531,25 @@ func (t *Table) newBuiltinMetricsTracer(ctx context.Context, isStreaming bool) *
 }
 
 // recordOperationCompletion records as many operation specific metrics as it can
+// Errors seen while creating metric attributes are ignored, since the metric
+// can still be recorded with the rest of the attributes.
 func recordOperationCompletion(mt *builtinMetricsTracer) {
 	if !mt.builtInEnabled {
 		return
 	}
 
 	// Calculate elapsed time
-	elapsedTimeMs := float64(time.Since(mt.currOp.startTime).Nanoseconds()) / 1000000
+	elapsedTimeMs := convertToMs(time.Since(mt.currOp.startTime))
 
-	// Attributes for operation_latencies
-	// Ignore error seen while creating metric attributes since metric can still
-	// be recorded with rest of the attributes
+	// Record operation_latencies
 	opLatAttrs, _ := mt.toOtelMetricAttrs(metricNameOperationLatencies)
 	mt.instrumentOperationLatencies.Record(mt.ctx, elapsedTimeMs, metric.WithAttributes(opLatAttrs...))
 
-	// Attributes for retry_count
-	// Ignore error seen while creating metric attributes since metric can still
-	// be recorded with rest of the attributes
+	// Record retry_count
 	retryCntAttrs, _ := mt.toOtelMetricAttrs(metricNameRetryCount)
-
-	// Only record when retry count is greater than 0 so the retry
-	// graph will be less confusing
 	if mt.currOp.attemptCount > 1 {
+		// Only record when retry count is greater than 0 so the retry
+		// graph will be less confusing
 		mt.instrumentRetryCount.Add(mt.ctx, mt.currOp.attemptCount-1, metric.WithAttributes(retryCntAttrs...))
 	}
 }
@@ -1604,23 +1601,21 @@ func gaxInvokeWithRecorder(ctx context.Context, mt *builtinMetricsTracer, method
 }
 
 // recordAttemptCompletion records as many attempt specific metrics as it can
+// Errors seen while creating metric attributes are ignored, since the metric
+// can still be recorded with the rest of the attributes.
 func recordAttemptCompletion(mt *builtinMetricsTracer) {
 	if !mt.builtInEnabled {
 		return
 	}
 
 	// Calculate elapsed time
-	elapsedTime := float64(time.Since(mt.currOp.currAttempt.startTime).Nanoseconds()) / 1000000
+	elapsedTime := convertToMs(time.Since(mt.currOp.currAttempt.startTime))
 
-	// Attributes for attempt_latencies
-	// Ignore error seen while creating metric attributes since metric can still
-	// be recorded with rest of the attributes
+	// Record attempt_latencies
 	attemptLatAttrs, _ := mt.toOtelMetricAttrs(metricNameAttemptLatencies)
 	mt.instrumentAttemptLatencies.Record(mt.ctx, elapsedTime, metric.WithAttributes(attemptLatAttrs...))
 
-	// Attributes for server_latencies
-	// Ignore error seen while creating metric attributes since metric can still
-	// be recorded with rest of the attributes
+	// Record server_latencies
 	serverLatAttrs, _ := mt.toOtelMetricAttrs(metricNameServerLatencies)
 	if mt.currOp.currAttempt.serverLatencyErr == nil {
 		mt.instrumentServerLatencies.Record(mt.ctx, mt.currOp.currAttempt.serverLatency, metric.WithAttributes(serverLatAttrs...))

diff --git a/bigtable/metric_util.go b/bigtable/metric_util.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"strconv"
 	"strings"
+	"time"
 
 	btpb "cloud.google.com/go/bigtable/apiv2/bigtablepb"
 	"google.golang.org/grpc/metadata"
@@ -92,3 +93,7 @@ func extractLocation(headerMD metadata.MD, trailerMD metadata.MD) (string, strin
 
 	return responseParams.GetClusterId(), responseParams.GetZoneId(), nil
 }
+
+func convertToMs(d time.Duration) float64 {
+	return float64(d.Nanoseconds()) / 1000000
+}
diff --git a/bigtable/metrics.go b/bigtable/metrics.go
@@ -60,6 +60,10 @@ const (
 	metricNameAttemptLatencies   = "attempt_latencies"
 	metricNameServerLatencies    = "server_latencies"
 	metricNameRetryCount         = "retry_count"
+
+	// Metric units
+	metricUnitMS    = "ms"
+	metricUnitCount = "1"
 )
 
 // These are effectively const, but for testing purposes they are mutable
@@ -207,7 +211,7 @@ func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) err
 	tf.operationLatencies, err = meter.Float64Histogram(
 		metricNameOperationLatencies,
 		metric.WithDescription("Total time until final operation success or failure, including retries and backoff."),
-		metric.WithUnit("ms"),
+		metric.WithUnit(metricUnitMS),
 		metric.WithExplicitBucketBoundaries(bucketBounds...),
 	)
 	if err != nil {
@@ -218,7 +222,7 @@ func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) err
 	tf.attemptLatencies, err = meter.Float64Histogram(
 		metricNameAttemptLatencies,
 		metric.WithDescription("Client observed latency per RPC attempt."),
-		metric.WithUnit("ms"),
+		metric.WithUnit(metricUnitMS),
 		metric.WithExplicitBucketBoundaries(bucketBounds...),
 	)
 	if err != nil {
@@ -229,7 +233,7 @@ func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) err
 	tf.serverLatencies, err = meter.Float64Histogram(
 		metricNameServerLatencies,
 		metric.WithDescription("The latency measured from the moment that the RPC entered the Google data center until the RPC was completed."),
-		metric.WithUnit("ms"),
+		metric.WithUnit(metricUnitMS),
 		metric.WithExplicitBucketBoundaries(bucketBounds...),
 	)
 	if err != nil {
@@ -240,6 +244,7 @@ func (tf *builtinMetricsTracerFactory) createInstruments(meter metric.Meter) err
 	tf.retryCount, err = meter.Int64Counter(
 		metricNameRetryCount,
 		metric.WithDescription("The number of additional RPCs sent after the initial attempt."),
+		metric.WithUnit(metricUnitCount),
 	)
 	return err
 }
@@ -268,6 +273,8 @@ type builtinMetricsTracer struct {
 }
 
 // opTracer is used to record metrics for the entire operation, including retries.
+// An operation is a logical unit representing a single method invocation on the client.
+// The method might require multiple attempts/RPCs and backoff logic to complete.
 type opTracer struct {
 	attemptCount int64
 
@@ -281,7 +288,6 @@ type opTracer struct {
 
 func (o *opTracer) setStartTime(t time.Time) {
 	o.startTime = t
-
 }
 
 func (o *opTracer) setStatus(status string) {
@@ -293,6 +299,7 @@ func (o *opTracer) incrementAttemptCount() {
 }
 
 // attemptTracer is used to record metrics for each individual attempt of the operation.
+// Each attempt corresponds to a single try of an RPC within an operation.
 type attemptTracer struct {
 	startTime time.Time
 	clusterID string