From 3f1e5283d5a5803dd9b488fb054b5be0a12d3e63 Mon Sep 17 00:00:00 2001 From: lyang24 Date: Tue, 6 Feb 2024 19:56:55 -0800 Subject: [PATCH] obs: export metrics about Go GC Assist work This commit introduced functions to extract exposed metrics in go runtime metrics api. The runtime metrics is sampled along in SampleEnvironment call every 10 seconds. New metric GcAssistNS is captured in this pr to capture represent an estimate to amount of effort of user go routines assist in gc activities in nanoseconds. Fixes: https://github.com/cockroachdb/cockroach/issues/88178 Release note: None --- docs/generated/eventlog.md | 1 + docs/generated/metrics/metrics.html | 1 + pkg/server/status/runtime.go | 98 +++++++++++++++++++ pkg/util/log/eventpb/health_events.proto | 3 + pkg/util/log/eventpb/json_encode_generated.go | 9 ++ 5 files changed, 112 insertions(+) diff --git a/docs/generated/eventlog.md b/docs/generated/eventlog.md index 0ded8b5faae1..2910eeca56e8 100644 --- a/docs/generated/eventlog.md +++ b/docs/generated/eventlog.md @@ -266,6 +266,7 @@ An event of type `runtime_stats` is recorded every 10 seconds as server health m | `GCRunCount` | The total number of GC runs. | no | | `NetHostRecvBytes` | The bytes received on all network interfaces since this process started. | no | | `NetHostSendBytes` | The bytes sent on all network interfaces since this process started. | no | +| `GCAssistNs` | Estimated total CPU time user goroutines spent performing GC tasks to assist the GC. Expressed in nanoseconds. | no | #### Common fields diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index b4487a955b6d..53418bca74b8 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -1558,6 +1558,7 @@ SERVERsys.cpu.user.percentCurrent user cpu percentage consumed by the CRDB processCPU TimeGAUGEPERCENTAVGNONE SERVERsys.fd.openProcess open file descriptorsFile DescriptorsGAUGECOUNTAVGNONE SERVERsys.fd.softlimitProcess open FD soft limitFile DescriptorsGAUGECOUNTAVGNONE +SERVERsys.gc.assist.nsEstimated total CPU time user goroutines spent to assist the GC processCPU TimeGAUGENANOSECONDSAVGNONE SERVERsys.gc.countTotal number of GC runsGC RunsGAUGECOUNTAVGNONE SERVERsys.gc.pause.nsTotal GC pauseGC PauseGAUGENANOSECONDSAVGNONE SERVERsys.gc.pause.percentCurrent GC pause percentageGC PauseGAUGEPERCENTAVGNONE diff --git a/pkg/server/status/runtime.go b/pkg/server/status/runtime.go index 500b561c9432..30ed7b63791c 100644 --- a/pkg/server/status/runtime.go +++ b/pkg/server/status/runtime.go @@ -12,10 +12,12 @@ package status import ( "context" + "fmt" "os" "regexp" "runtime" "runtime/debug" + "runtime/metrics" "time" "github.com/cockroachdb/cockroach/pkg/build" @@ -93,6 +95,12 @@ var ( Measurement: "GC Pause", Unit: metric.Unit_PERCENT, } + metaGCAssistNS = metric.Metadata{ + Name: "sys.gc.assist.ns", + Help: "Estimated total CPU time user goroutines spent to assist the GC process", + Measurement: "CPU Time", + Unit: metric.Unit_NANOSECONDS, + } metaCPUUserNS = metric.Metadata{ Name: "sys.cpu.user.ns", @@ -292,6 +300,85 @@ var diskMetricsIgnoredDevices = envutil.EnvOrDefaultString("COCKROACH_DISK_METRI // error : any issues fetching stats. This should be a warning only. var getCgoMemStats func(context.Context) (uint, uint, error) +// Estimated total CPU time goroutines spent performing GC tasks to assist the +// GC and prevent it from falling behind the application. This metric is an +// overestimate, and not directly comparable to system CPU time measurements. +// Compare only with other /cpu/classes metrics. +const runtimeMetricGCAssist = "/cpu/classes/gc/mark/assist:cpu-seconds" + +var runtimeMetrics = []string{runtimeMetricGCAssist} + +// GoRuntimeSampler are a collection of metrics to sample from golang's runtime environment and +// runtime metrics metadata. It fetches go runtime metrics and provides read access. +// https://pkg.go.dev/runtime/metrics +type GoRuntimeSampler struct { + // The collection of metrics we want to sample. + metricSamples []metrics.Sample + // The mapping to find metric slot in metricSamples by name. + metricIndexes map[string]int +} + +// getIndex finds the position of metrics in the sample array by name. +func (grm *GoRuntimeSampler) getIndex(name string) int { + i, found := grm.metricIndexes[name] + if !found { + panic(fmt.Sprintf("unsampled metric: %s", name)) + } + return i +} + +// float64 gets the sampled value by metrics name as float64. +// N.B. This method will panic if the metrics value is not metrics.KindFloat64. +func (grm *GoRuntimeSampler) float64(name string) float64 { + i := grm.getIndex(name) + return grm.metricSamples[i].Value.Float64() +} + +// sampleRuntimeMetrics reads from metrics.Read api and fill in the value +// in the metricSamples field. +// Benchmark results on 12 core Apple M3 Pro: +// goos: darwin +// goarch: arm64 +// pkg: github.com/cockroachdb/cockroach/pkg/server/status +// BenchmarkGoRuntimeSampler +// BenchmarkGoRuntimeSampler-12 28886398 40.03 ns/op +// +// func BenchmarkGoRuntimeSampler(b *testing.B) { +// s := NewGoRuntimeSampler([]string{runtimeMetricGCAssist}) +// for n := 0; n < b.N; n++ { +// s.sampleRuntimeMetrics() +// } +// } +func (grm *GoRuntimeSampler) sampleRuntimeMetrics() { + metrics.Read(grm.metricSamples) +} + +// NewGoRuntimeSampler constructs a new GoRuntimeSampler object. +// This method will panic on invalid metrics names provided. +func NewGoRuntimeSampler(metricNames []string) *GoRuntimeSampler { + m := metrics.All() + metricTypes := make(map[string]metrics.ValueKind, len(m)) + for _, desc := range m { + metricTypes[desc.Name] = desc.Kind + } + metricSamples := make([]metrics.Sample, len(metricNames)) + metricIndexes := make(map[string]int, len(metricNames)) + for i, n := range metricNames { + _, hasDesc := metricTypes[n] + if !hasDesc { + panic(fmt.Sprintf("unexpected metric: %s", n)) + } + metricSamples[i] = metrics.Sample{Name: n} + metricIndexes[n] = i + } + + grm := &GoRuntimeSampler{ + metricSamples: metricSamples, + metricIndexes: metricIndexes, + } + return grm +} + // RuntimeStatSampler is used to periodically sample the runtime environment // for useful statistics, performing some rudimentary calculations and storing // the resulting information in a format that can be easily consumed by status @@ -326,6 +413,8 @@ type RuntimeStatSampler struct { // Only show "not implemented" errors once, we don't need the log spam. fdUsageNotImplemented bool + goRuntimeSampler *GoRuntimeSampler + // Metric gauges maintained by the sampler. // Go runtime stats. CgoCalls *metric.Gauge @@ -338,6 +427,7 @@ type RuntimeStatSampler struct { GcCount *metric.Gauge GcPauseNS *metric.Gauge GcPausePercent *metric.GaugeFloat64 + GcAssistNS *metric.Gauge // CPU stats for the CRDB process usage. CPUUserNS *metric.Gauge CPUUserPercent *metric.GaugeFloat64 @@ -414,6 +504,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta startTimeNanos: clock.Now().UnixNano(), initialNetCounters: netCounters, initialDiskCounters: diskCounters, + goRuntimeSampler: NewGoRuntimeSampler(runtimeMetrics), CgoCalls: metric.NewGauge(metaCgoCalls), Goroutines: metric.NewGauge(metaGoroutines), RunnableGoroutinesPerCPU: metric.NewGaugeFloat64(metaRunnableGoroutinesPerCPU), @@ -424,6 +515,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta GcCount: metric.NewGauge(metaGCCount), GcPauseNS: metric.NewGauge(metaGCPauseNS), GcPausePercent: metric.NewGaugeFloat64(metaGCPausePercent), + GcAssistNS: metric.NewGauge(metaGCAssistNS), CPUUserNS: metric.NewGauge(metaCPUUserNS), CPUUserPercent: metric.NewGaugeFloat64(metaCPUUserPercent), @@ -515,6 +607,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment( gc := &debug.GCStats{} debug.ReadGCStats(gc) + rsr.goRuntimeSampler.sampleRuntimeMetrics() + numCgoCall := runtime.NumCgoCall() numGoroutine := runtime.NumGoroutine() @@ -615,6 +709,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment( combinedNormalizedHostPerc := (hostSrate + hostUrate) / float64(numHostCPUs) gcPauseRatio := float64(uint64(gc.PauseTotal)-rsr.last.gcPauseTime) / dur runnableSum := goschedstats.CumulativeNormalizedRunnableGoroutines() + gcAssistSeconds := rsr.goRuntimeSampler.float64(runtimeMetricGCAssist) + gcAssistNS := int64(gcAssistSeconds * 1e9) // The number of runnable goroutines per CPU is a count, but it can vary // quickly. We don't just want to get a current snapshot of it, we want the // average value since the last sampling. @@ -651,6 +747,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment( GCRunCount: uint64(gc.NumGC), NetHostRecvBytes: deltaNet.BytesRecv, NetHostSendBytes: deltaNet.BytesSent, + GCAssistNs: uint64(gcAssistNS), } logStats(ctx, stats) @@ -668,6 +765,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment( rsr.GcCount.Update(gc.NumGC) rsr.GcPauseNS.Update(int64(gc.PauseTotal)) rsr.GcPausePercent.Update(gcPauseRatio) + rsr.GcAssistNS.Update(gcAssistNS) rsr.CPUUserNS.Update(procUtime) rsr.CPUUserPercent.Update(procUrate) diff --git a/pkg/util/log/eventpb/health_events.proto b/pkg/util/log/eventpb/health_events.proto index 35c33f13c879..8f758760676d 100644 --- a/pkg/util/log/eventpb/health_events.proto +++ b/pkg/util/log/eventpb/health_events.proto @@ -64,4 +64,7 @@ message RuntimeStats { uint64 net_host_recv_bytes = 18 [(gogoproto.jsontag) = ",omitempty"]; // The bytes sent on all network interfaces since this process started. uint64 net_host_send_bytes = 19 [(gogoproto.jsontag) = ",omitempty"]; + // Estimated total CPU time user goroutines spent performing GC tasks to + // assist the GC. Expressed in nanoseconds. + uint64 gc_assist_ns = 20 [(gogoproto.customname) = "GCAssistNs", (gogoproto.jsontag) = ",omitempty"]; } diff --git a/pkg/util/log/eventpb/json_encode_generated.go b/pkg/util/log/eventpb/json_encode_generated.go index ea3e473b0b47..b357283068f4 100644 --- a/pkg/util/log/eventpb/json_encode_generated.go +++ b/pkg/util/log/eventpb/json_encode_generated.go @@ -4251,6 +4251,15 @@ func (m *RuntimeStats) AppendJSONFields(printComma bool, b redact.RedactableByte b = strconv.AppendUint(b, uint64(m.NetHostSendBytes), 10) } + if m.GCAssistNs != 0 { + if printComma { + b = append(b, ',') + } + printComma = true + b = append(b, "\"GCAssistNs\":"...) + b = strconv.AppendUint(b, uint64(m.GCAssistNs), 10) + } + return printComma, b }