Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

obs: export metrics about Go GC Assist work #118875

Merged
merged 1 commit into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1558,6 +1558,7 @@
<tr><td>SERVER</td><td>sys.cpu.user.percent</td><td>Current user cpu percentage consumed by the CRDB process</td><td>CPU Time</td><td>GAUGE</td><td>PERCENT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.fd.open</td><td>Process open file descriptors</td><td>File Descriptors</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.fd.softlimit</td><td>Process open FD soft limit</td><td>File Descriptors</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.gc.assist.ns</td><td>Estimated total CPU time user goroutines spent to assist the GC process</td><td>CPU Time</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.gc.count</td><td>Total number of GC runs</td><td>GC Runs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.gc.pause.ns</td><td>Total GC pause</td><td>GC Pause</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>SERVER</td><td>sys.gc.pause.percent</td><td>Current GC pause percentage</td><td>GC Pause</td><td>GAUGE</td><td>PERCENT</td><td>AVG</td><td>NONE</td></tr>
Expand Down
97 changes: 97 additions & 0 deletions pkg/server/status/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@ package status

import (
"context"
"fmt"
"os"
"regexp"
"runtime"
"runtime/debug"
"runtime/metrics"
"time"

"github.com/cockroachdb/cockroach/pkg/build"
Expand Down Expand Up @@ -93,6 +95,12 @@ var (
Measurement: "GC Pause",
Unit: metric.Unit_PERCENT,
}
metaGCAssistNS = metric.Metadata{
Name: "sys.gc.assist.ns",
Help: "Estimated total CPU time user goroutines spent to assist the GC process",
Measurement: "CPU Time",
Unit: metric.Unit_NANOSECONDS,
}

metaCPUUserNS = metric.Metadata{
Name: "sys.cpu.user.ns",
Expand Down Expand Up @@ -292,6 +300,85 @@ var diskMetricsIgnoredDevices = envutil.EnvOrDefaultString("COCKROACH_DISK_METRI
// error : any issues fetching stats. This should be a warning only.
var getCgoMemStats func(context.Context) (uint, uint, error)

// Estimated total CPU time goroutines spent performing GC tasks to assist the
// GC and prevent it from falling behind the application. This metric is an
// overestimate, and not directly comparable to system CPU time measurements.
// Compare only with other /cpu/classes metrics.
const runtimeMetricGCAssist = "/cpu/classes/gc/mark/assist:cpu-seconds"

var runtimeMetrics = []string{runtimeMetricGCAssist}

// GoRuntimeSampler are a collection of metrics to sample from golang's runtime environment and
// runtime metrics metadata. It fetches go runtime metrics and provides read access.
// https://pkg.go.dev/runtime/metrics
type GoRuntimeSampler struct {
// The collection of metrics we want to sample.
metricSamples []metrics.Sample
// The mapping to find metric slot in metricSamples by name.
metricIndexes map[string]int
}

// getIndex finds the position of metrics in the sample array by name.
func (grm *GoRuntimeSampler) getIndex(name string) int {
i, found := grm.metricIndexes[name]
if !found {
panic(fmt.Sprintf("unsampled metric: %s", name))
}
return i
}

// float64 gets the sampled value by metrics name as float64.
// N.B. This method will panic if the metrics value is not metrics.KindFloat64.
func (grm *GoRuntimeSampler) float64(name string) float64 {
i := grm.getIndex(name)
return grm.metricSamples[i].Value.Float64()
}

// sampleRuntimeMetrics reads from metrics.Read api and fill in the value
// in the metricSamples field.
// Benchmark results on 12 core Apple M3 Pro:
// goos: darwin
// goarch: arm64
// pkg: github.com/cockroachdb/cockroach/pkg/server/status
// BenchmarkGoRuntimeSampler
// BenchmarkGoRuntimeSampler-12 28886398 40.03 ns/op
//
// func BenchmarkGoRuntimeSampler(b *testing.B) {
// s := NewGoRuntimeSampler([]string{runtimeMetricGCAssist})
// for n := 0; n < b.N; n++ {
// s.sampleRuntimeMetrics()
// }
// }
func (grm *GoRuntimeSampler) sampleRuntimeMetrics() {
metrics.Read(grm.metricSamples)
}

// NewGoRuntimeSampler constructs a new GoRuntimeSampler object.
// This method will panic on invalid metrics names provided.
func NewGoRuntimeSampler(metricNames []string) *GoRuntimeSampler {
m := metrics.All()
metricTypes := make(map[string]metrics.ValueKind, len(m))
for _, desc := range m {
metricTypes[desc.Name] = desc.Kind
}
metricSamples := make([]metrics.Sample, len(metricNames))
metricIndexes := make(map[string]int, len(metricNames))
for i, n := range metricNames {
_, hasDesc := metricTypes[n]
if !hasDesc {
panic(fmt.Sprintf("unexpected metric: %s", n))
}
metricSamples[i] = metrics.Sample{Name: n}
metricIndexes[n] = i
}

grm := &GoRuntimeSampler{
metricSamples: metricSamples,
metricIndexes: metricIndexes,
}
return grm
}

// RuntimeStatSampler is used to periodically sample the runtime environment
// for useful statistics, performing some rudimentary calculations and storing
// the resulting information in a format that can be easily consumed by status
Expand Down Expand Up @@ -326,6 +413,8 @@ type RuntimeStatSampler struct {
// Only show "not implemented" errors once, we don't need the log spam.
fdUsageNotImplemented bool

goRuntimeSampler *GoRuntimeSampler

// Metric gauges maintained by the sampler.
// Go runtime stats.
CgoCalls *metric.Gauge
Expand All @@ -338,6 +427,7 @@ type RuntimeStatSampler struct {
GcCount *metric.Gauge
GcPauseNS *metric.Gauge
GcPausePercent *metric.GaugeFloat64
GcAssistNS *metric.Gauge
// CPU stats for the CRDB process usage.
CPUUserNS *metric.Gauge
CPUUserPercent *metric.GaugeFloat64
Expand Down Expand Up @@ -414,6 +504,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta
startTimeNanos: clock.Now().UnixNano(),
initialNetCounters: netCounters,
initialDiskCounters: diskCounters,
goRuntimeSampler: NewGoRuntimeSampler(runtimeMetrics),
CgoCalls: metric.NewGauge(metaCgoCalls),
Goroutines: metric.NewGauge(metaGoroutines),
RunnableGoroutinesPerCPU: metric.NewGaugeFloat64(metaRunnableGoroutinesPerCPU),
Expand All @@ -424,6 +515,7 @@ func NewRuntimeStatSampler(ctx context.Context, clock hlc.WallClock) *RuntimeSta
GcCount: metric.NewGauge(metaGCCount),
GcPauseNS: metric.NewGauge(metaGCPauseNS),
GcPausePercent: metric.NewGaugeFloat64(metaGCPausePercent),
GcAssistNS: metric.NewGauge(metaGCAssistNS),

CPUUserNS: metric.NewGauge(metaCPUUserNS),
CPUUserPercent: metric.NewGaugeFloat64(metaCPUUserPercent),
Expand Down Expand Up @@ -515,6 +607,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
gc := &debug.GCStats{}
debug.ReadGCStats(gc)

rsr.goRuntimeSampler.sampleRuntimeMetrics()

numCgoCall := runtime.NumCgoCall()
numGoroutine := runtime.NumGoroutine()

Expand Down Expand Up @@ -615,6 +709,8 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
combinedNormalizedHostPerc := (hostSrate + hostUrate) / float64(numHostCPUs)
gcPauseRatio := float64(uint64(gc.PauseTotal)-rsr.last.gcPauseTime) / dur
runnableSum := goschedstats.CumulativeNormalizedRunnableGoroutines()
gcAssistSeconds := rsr.goRuntimeSampler.float64(runtimeMetricGCAssist)
gcAssistNS := int64(gcAssistSeconds * 1e9)
// The number of runnable goroutines per CPU is a count, but it can vary
// quickly. We don't just want to get a current snapshot of it, we want the
// average value since the last sampling.
Expand Down Expand Up @@ -668,6 +764,7 @@ func (rsr *RuntimeStatSampler) SampleEnvironment(
rsr.GcCount.Update(gc.NumGC)
rsr.GcPauseNS.Update(int64(gc.PauseTotal))
rsr.GcPausePercent.Update(gcPauseRatio)
rsr.GcAssistNS.Update(gcAssistNS)

rsr.CPUUserNS.Update(procUtime)
rsr.CPUUserPercent.Update(procUrate)
Expand Down
Loading