diff --git a/docs/RFCS/20220602_fine_grained_cpu_attribution.md b/docs/RFCS/20220602_fine_grained_cpu_attribution.md
new file mode 100644
index 000000000000..15e957e55c89
--- /dev/null
+++ b/docs/RFCS/20220602_fine_grained_cpu_attribution.md
@@ -0,0 +1,617 @@
+- Feature Name: Fine-grained CPU attribution
+- Status: accepted
+- Start Date: 2022-06-02
+- Authors: Irfan Sharif, Austen McClernon
+- RFC PR: [#82356](https://github.com/cockroachdb/cockroach/pull/82356)
+- Cockroach Issue: N/A
+
+# Summary
+
+We propose using a patched Go runtime to track CPU use at the level of
+individual goroutines, using the primitive to derive an accurate resource
+consumption signal in various CRDB subsystems. The compiler, language, and
+tooling will remain the same.
+
+*NOTE: This RFC started off as an internal gdoc (grep for "Fine-grained CPU
+attribution"), which has valuable commentary and additional links.*
+
+## Motivation
+
+CRDB lacks primitives to attribute CPU usage to specific scopes (ranges,
+tenants, sessions, statements) or processes (pebble compactions, snapshot
+generation). For subsystems that rely on such attribution, we use proxy signals
+like '# of batch requests' or 'size of read/write request' as observed by a
+range/replica or a given tenant. With respect to modeling CPU usage, this is:
+- Inaccurate: a given proportion of the signal can translate to a different
+  proportion of actual CPU usage;
+- Imprecise: repeated, identical measurements of the signal can correspond to
+  varied actual CPU usage.
+
+The inaccuracy and imprecision are due to:
+- Different batch requests having different compositions with respect to the
+ underlying requests;
+- Different request types varying in how much CPU activity they incur;
+- Requests on different stores incurring different CPU activity depending on
+  LSM health, sub-levels, and latch contention with other requests;
+- Requests varying in size, with costs scaling accordingly.
+
+This can lead to suboptimal decisions (placement and forecasting in the
+allocator, resource control in the per-store tenant rate limiter), and makes
+the system difficult to reason about. The latter becomes especially important
+as we build
+[visualizations](https://github.com/cockroachdb/cockroach/pull/76895) of CRDB
+internal state for end-users.
+
+Our proxy signals don’t generalize to other subsystems, some of which are
+consequently CPU-unaware. Changefeed processor placement, for example, is
+agnostic to the CPU usage driven by the processors themselves, which can lead
+to CPU hotspots and poor cluster-wide resource utilization. It's difficult
+today to answer how much cluster-wide CPU usage a single statement execution
+drove, what % of CPU on a given node is due to activity on a specific index, or
+how much is driven by a specific tenant.
+
+## Design
+
+The time spent per-goroutine in the running state (henceforth "CPU time") is an
+accurate and precise measure for CPU usage. This is not currently tracked by
+the Go runtime (though there's an
+[issue](https://github.com/golang/go/issues/41554) upstream); we propose doing
+so with the following patch:
+
+```diff
+diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
+index 1e4f872726..ced56dc4f6 100644
+--- a/src/runtime/runtime2.go
++++ b/src/runtime/runtime2.go
+@@ -488,6 +487,9 @@ type g struct {
+ labels unsafe.Pointer // profiler labels
+ timer *timer // cached timer for time.Sleep
+ selectDone uint32 // are we participating in a select and did someone win the race?
++ lastsched int64 // timestamp when the G last started running
++ runningnanos int64 // wall time spent in the running state
+
+
+diff --git a/src/runtime/proc.go b/src/runtime/proc.go
+index f5e528e8e9..9f938311b8 100644
+--- a/src/runtime/proc.go
++++ b/src/runtime/proc.go
+@@ -994,8 +994,18 @@ func casgstatus(gp *g, oldval, newval uint32) {
+ }
+ }
+
+- // Handle tracking for scheduling latencies.
++ // Handle tracking for scheduling and running latencies.
++ now := nanotime()
++ if newval == _Grunning {
++ // We're transitioning into the running state, record the timestamp for
++ // subsequent use.
++ gp.lastsched = now
++ }
+ if oldval == _Grunning {
++ // We're transitioning out of running, record how long we were in the
++ // state.
++ gp.runningnanos += now - gp.lastsched
++
+ // Track every 8th time a goroutine transitions out of running.
+ if gp.trackingSeq%gTrackingPeriod == 0 {
+ gp.tracking = true
+@@ -1007,14 +1017,12 @@ func casgstatus(gp *g, oldval, newval uint32) {
+ // We transitioned out of runnable, so measure how much
+ // time we spent in this state and add it to
+ // runnableTime.
+- now := nanotime()
+ gp.runnableTime += now - gp.runnableStamp
+ gp.runnableStamp = 0
+ }
+ if newval == _Grunnable {
+ // We just transitioned into runnable, so record what
+ // time that happened.
+- now := nanotime()
+ gp.runnableStamp = now
+ } else if newval == _Grunning {
+ // We're transitioning into running, so turn off
+@@ -3258,6 +3266,14 @@ func dropg() {
+ setGNoWB(&_g_.m.curg, nil)
+ }
+
++// grunningnanos returns the wall time spent by current g in the running state.
++// A goroutine may be running on an OS thread that's descheduled by the OS
++// scheduler, this time still counts towards the metric.
++func grunningnanos() int64 {
++ gp := getg()
++ return gp.runningnanos + nanotime() - gp.lastsched
++}
++
+ // checkTimers runs any timers for the P that are ready.
+ // If now is not 0 it is the current time.
+ // It returns the passed time or the current time if now was passed as 0.
+@@ -3491,6 +3507,8 @@ func goexit0(gp *g) {
+ gp.param = nil
+ gp.labels = nil
+ gp.timer = nil
++ gp.lastsched = 0
++ gp.runningnanos = 0
+
+ if gcBlackenEnabled != 0 && gp.gcAssistBytes > 0 {
+ // Flush assist credit to the global pool. This gives
+diff --git a/src/runtime/sizeof_test.go b/src/runtime/sizeof_test.go
+index 9ce0a3afcd..71de8052bd 100644
+--- a/src/runtime/sizeof_test.go
++++ b/src/runtime/sizeof_test.go
+@@ -21,7 +21,7 @@ func TestSizeof(t *testing.T) {
+ _32bit uintptr // size on 32bit platforms
+ _64bit uintptr // size on 64bit platforms
+ }{
+- {runtime.G{}, 240, 392}, // g, but exported for testing
++ {runtime.G{}, 256, 408}, // g, but exported for testing
+ {runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
+ }
+```
+
+Background: The runtime maintains a `type g struct` for every goroutine, and
+the scheduler is responsible for transitioning each one through various
+[states](https://github.com/golang/go/blob/afd181cf0b69c3591d7e47ceca4fabf14434d77e/src/runtime/runtime2.go#L14-L85).
+`_Grunning` is the one we're interested in, which indicates that the goroutine
+may execute user (i.e. CRDB) code. The goroutine is also assigned to an OS
+thread (`type m struct`) that is in turn assigned to a CPU core (`type p
+struct`). In addition to the `src/runtime` [package
+docs](https://github.com/golang/go/blob/cc4957a5f6eba946f359ed9646ec3e5083a259a9/src/runtime/proc.go#L19-L111),
+[Kavya Joshi's](https://www.youtube.com/watch?v=YHRO5WQGh0k) and [Dmitry
+Vyukov's](https://www.youtube.com/watch?v=-K11rY57K7k) presentations on the
+runtime internals can serve as helpful reading material.
+
+At the point where a `g` transitions in and out of the `_Grunning` state, we
+maintain per-`g` counters that capture the wall time spent in that state. It's
+possible for an OS thread (`m`) that a goroutine (`g`) was running on to be
+descheduled by the OS scheduler in favor of non-CRDB processes running on the
+same node. This is invisible to the Go runtime, and as such, the patch above
+will count this off-CPU time towards the per-`g` total (this is evaluated
+below).
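+
+For illustration, here's a minimal sketch of how CRDB code could read the new
+counter absent a public runtime API, using the same `go:linkname` technique
+discussed under "Future runtime changes" below (the package and helper names
+are assumed for illustration, not part of the patch):
+
+```go
+// Package grunning is a hypothetical CRDB-side wrapper around the counter
+// introduced by the patch above. A bodyless declaration like the one below
+// needs an (empty) .s file in the package to compile.
+package grunning
+
+import (
+	"time"
+	_ "unsafe" // for go:linkname
+)
+
+//go:linkname grunningnanos runtime.grunningnanos
+func grunningnanos() int64
+
+// Time returns the wall time the calling goroutine has spent in the running
+// state.
+func Time() time.Duration {
+	return time.Duration(grunningnanos())
+}
+```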
+
+### Comparison to RUs and '# of batch requests'
+
+In multi-tenant environments we currently model CPU usage using a linear model
+over the '# of read/write requests' and their corresponding sizes (sketched
+below). We use this linear model for cost attribution and resource control; it
+is more or less accurate depending on how the running workload compares to the
+workloads the model was trained on. Improved signal accuracy for CPU translates
+to:
+- Better accounting independent of how we present consumption (RUs, rows
+  read/written); we should know which patterns we're subsidizing;
+- Performance predictability in multi-tenant environments.
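+
+The proxy model referenced above has roughly the following shape (a sketch for
+intuition only; all names and coefficients below are placeholders, not the
+actual cost model):
+
+```go
+package costmodel
+
+// estimatedCPUNanos sketches the linear proxy model: estimated CPU is a
+// weighted sum of request counts and sizes. The coefficients are fit against
+// training workloads, so the estimate degrades as the running workload
+// diverges from them. All constants here are made up.
+func estimatedCPUNanos(readReqs, readBytes, writeReqs, writeBytes int64) int64 {
+	const (
+		nanosPerReadReq   = 1000 // assumed
+		nanosPerReadByte  = 2    // assumed
+		nanosPerWriteReq  = 2000 // assumed
+		nanosPerWriteByte = 5    // assumed
+	)
+	return readReqs*nanosPerReadReq + readBytes*nanosPerReadByte +
+		writeReqs*nanosPerWriteReq + writeBytes*nanosPerWriteByte
+}
+```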
+
+Derived signals like '# of batch requests' are hard to map to the underlying
+hardware capacities or utilization. Base signals like 'time-spent on CPU', in
+contrast, are easier to normalize to capacity and infer utilization from. We
+imagine introducing similar base signals for other resources of interest (disk
+IOPs and bandwidth) and retooling rebalancing algorithms to consider
+independent hardware dimensions separately instead of using [signals that
+combine](https://github.com/cockroachdb/cockroach/issues/34590)
+dimensions[disk-util]. We can try
+[calibrating](https://github.com/cockroachdb/cockroach/pull/76252) derived
+signal models using experimental data, an expensive process even without varying
+for saturation, hardware, contention, etc. Even perfect models would need
+recalibration as the implementation changes, placing practical limits on how
+accurate they can be.
+
+### Short-term use cases
+
+We propose the following features to build experience with using a modified
+runtime, and to further evaluate signal accuracy:
+- Observability: A per-store CPU usage breakdown by ranges and tenants, powered
+ by measured CPU time. We'd make this accessible through vtables and include
+ results in debug zips, serving as a CPU-only version of today's hot-ranges
+  report. This was prototyped; results are included below.
+- Observability: Surfacing per-statement cluster-wide CPU usage as part of
+ `EXPLAIN ANALYZE`.
+- (stretch goal) Tenant isolation: Integrating measured CPU time in the [tenant
+ rate limiter](https://github.com/cockroachdb/cockroach/issues/77041) to
+ improve performance predictability observed by tenants sharing KV nodes.
+ - We foresee integrating the tenant rate limiter into admission control which
+ would help evaluate how effective this is as a signal for CPU resource
+ control.
+
+From an earlier
+[prototype](https://github.com/irfansharif/cockroach/tree/220505.cputime-demo)
+exposing per-store replica/tenant CPU time:
+
+![](20220602_fine_grained_cpu_attribution/vtables.png)
+
+![](20220602_fine_grained_cpu_attribution/index-nanos.png)
+
+### Usage across machines, goroutines, and code paths
+
+It's common to spawn multiple goroutines for the same operation, and also to do
+so across RPC boundaries. For the latter (say, if looking to accumulate
+cluster-wide CPU usage for a given statement) we can propagate per-node
+request-scoped total CPU time using tracing events; this plumbing already
+exists. For request-scoped total CPU time across cooperating goroutines, we can
+atomically maintain a counter stashed in the surrounding context. We imagine
+retrieving per-goroutine running time within the libraries we already use to
+manage goroutine lifetimes (`Stopper`, `ctxgroup`), lending more weight to
+[banning](https://github.com/cockroachdb/cockroach/issues/58164) naked `go`s
+altogether.
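+
+A minimal sketch of that context-stashed counter (all package and function
+names below are illustrative; `grunningnanos` stands in for the linkname'd
+runtime helper):
+
+```go
+package cpuattr
+
+import (
+	"context"
+	"sync/atomic"
+	_ "unsafe" // for go:linkname
+)
+
+//go:linkname grunningnanos runtime.grunningnanos
+func grunningnanos() int64
+
+type ctxKey struct{}
+
+// WithCounter returns a context carrying a shared running-time counter, to be
+// threaded through cooperating goroutines.
+func WithCounter(ctx context.Context, counter *int64) context.Context {
+	return context.WithValue(ctx, ctxKey{}, counter)
+}
+
+// RunTracked invokes fn and charges the calling goroutine's running time to
+// the counter in ctx, if any. It must run on the goroutine doing the work,
+// since the underlying counter is per-goroutine.
+func RunTracked(ctx context.Context, fn func(context.Context)) {
+	start := grunningnanos()
+	defer func() {
+		if counter, ok := ctx.Value(ctxKey{}).(*int64); ok {
+			atomic.AddInt64(counter, grunningnanos()-start)
+		}
+	}()
+	fn(ctx)
+}
+```
+
+Goroutine-lifetime libraries like `Stopper`/`ctxgroup` could wrap task closures
+in something like `RunTracked`, and the per-node totals could then be attached
+to trace events as described above.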
+
+For work done per-store that's attributable to an individual range or tenant,
+there isn't a single code path (the `Sender` stack) it all flows through.
+Consider KV queues that step through individual replicas one at a time and do
+some work on behalf of each one, work we’d perhaps want to attribute to the
+specific replicas/tenants. In prototypes, we captured CPU profiles from KV under
+stock workloads to get a broad sense of which stack traces are attributable to
+individual ranges/tenants, and opted into tracking at those points (purple
+segments in the figure below) to maintain a store-level view of how much CPU
+activity is driven by a specific range/tenant.
+
+![](20220602_fine_grained_cpu_attribution/pprof.png)
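+
+A sketch of what opting into tracking at one such point could look like (the
+queue shape and every name below are invented for illustration):
+
+```go
+package kvattr
+
+import _ "unsafe" // for go:linkname
+
+//go:linkname grunningnanos runtime.grunningnanos
+func grunningnanos() int64
+
+type rangeID int64
+
+type replica struct{ id rangeID }
+
+type queue struct {
+	process func(*replica) // per-replica work, done on the calling goroutine
+}
+
+// processAll steps through replicas one at a time, charging the goroutine's
+// measured running time for each step to that replica's range. Aggregated
+// per-store, this yields the range/tenant CPU breakdown described above.
+func (q *queue) processAll(replicas []*replica, nanosByRange map[rangeID]int64) {
+	for _, repl := range replicas {
+		start := grunningnanos()
+		q.process(repl)
+		nanosByRange[repl.id] += grunningnanos() - start
+	}
+}
+```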
+
+### Development and release
+
+We point to [mirrored
+artifacts](https://github.com/cockroachdb/cockroach/blob/1e1ff14e73680b1a0e2877f8dc0bc56a657fc50c/WORKSPACE#L148-L150)
+of the Go runtime that engineers use when building/running tests through Bazel.
+The official CRDB release binaries [use
+Bazel](https://github.com/cockroachdb/cockroach/pull/76897), using the same
+mirrored runtime. Pointing to a modified runtime[release-go] is a
+matter of hosting it in publicly maintained buckets for things to "just work"
+([prototype](https://github.com/irfansharif/runner/blob/153154c3cd9825ef067a11bf98ef8b0501db54d0/WORKSPACE#L38-L48)).
+As for the deprecated Make-based workflows, we can ensure that CRDB library
+components/tests that make use of the runtime patch are gated behind a
+Bazel-only build tag. This also keeps IDE integration working.
+
+### Future runtime changes
+
+We argue that this proposal should be considered more along the lines of a
+patchset (and one that we [intend to
+upstream](https://github.com/golang/go/pull/51347)) than a hard fork. A parallel
+is how we handle 3rd-party libraries: whenever we patch one and point to a CRDB
+fork, we still send out PRs upstream and point back to upstream SHAs once
+they’re merged. Seen through this lens, Bazel just provides the machinery to do
+the same for the Go runtime. The review discussions around a new public API for
+the counter are an important consideration for the Go team but less so for us;
+we’d happily `go:linkname`
+against tracked fields or private helpers to get what we need (we do similar
+things for [goroutine
+IDs](https://github.com/petermattis/goid/blob/07eaf5d0b9f4a816ddb51ceb22e6a9a16eef7d33/runtime_go1.9.go#L21-L36)
+and [runnable
+goroutines](https://github.com/cockroachdb/cockroach/tree/6b87aa6/pkg/util/goschedstats)).
+Independent of if/when it’s upstreamed, we can avoid coupling our use of precise
+attribution to whichever major Go release the change eventually appears in; the
+diff is deliberately small to make it easy to re-apply to minor/major Go
+releases going forward. We expect future proposals for runtime changes, if any,
+to be evaluated
+on their own terms. Some guidelines to consider: patch size, review expertise,
+upstream-ability, fallback behavior without runtime changes, proximity to
+`crypto` ("core" functionality in general), and performance implications.
+
+## Evaluation
+
+Under stock TPC-C runs we verified that measured CPU time:
+- Is stable over time, both under idle conditions and sustained load;
+- Trends with actual CPU usage (baseline: CRDB process CPU %), and can be
+  normalized to capacity and used to forecast the effect of range/tenant
+  movement.
+
+Smoke tests running `kv --init --read-percent 50 --concurrency 1000 --ramp 1m
+--duration 4m` on a multi-node cluster showed no discernible throughput or
+latency impact when built with the proposed Go changes. Retrieving the goroutine
+running time (including the vDSO call to `nanotime()`) takes on the order of
+nanoseconds:
+
+```
+goos: linux
+goarch: amd64
+cpu: Intel(R) Xeon(R) CPU @ 2.20GHz
+BenchmarkGRunningNanos
+BenchmarkGRunningNanos-24 195321096 30.67 ns/op
+BenchmarkGRunningNanos-24 195100147 30.77 ns/op
+BenchmarkGRunningNanos-24 195415414 30.71 ns/op
+BenchmarkGRunningNanos-24 195564742 30.70 ns/op
+BenchmarkGRunningNanos-24 195472393 30.70 ns/op
+PASS
+```
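+
+The benchmark is a tight loop around the counter read; roughly the following
+(shape assumed, the actual benchmark lives in the prototype branch):
+
+```go
+package grunning
+
+import (
+	"testing"
+	_ "unsafe" // for go:linkname
+)
+
+//go:linkname grunningnanos runtime.grunningnanos
+func grunningnanos() int64
+
+// BenchmarkGRunningNanos measures the cost of reading the calling goroutine's
+// running time; the read includes the vDSO call to nanotime().
+func BenchmarkGRunningNanos(b *testing.B) {
+	var nanos int64
+	for i := 0; i < b.N; i++ {
+		nanos = grunningnanos()
+	}
+	_ = nanos
+}
+```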
+
+Microbenchmarks from the Go runtime that evaluate scheduler behavior show
+little to no impact:
+
+```
+goos: linux
+goarch: amd64
+cpu: Intel(R) Xeon(R) CPU @ 2.20GHz
+name old time/op new time/op delta
+PingPongHog-24 517ns ± 8% 513ns ±13% ~ (p=0.690 n=5+5)
+CreateGoroutines-24 302ns ± 1% 304ns ± 1% ~ (p=0.310 n=5+5)
+CreateGoroutinesParallel-24 33.6ns ± 1% 34.0ns ± 1% +1.30% (p=0.032 n=5+5)
+CreateGoroutinesCapture-24 2.86µs ± 2% 2.89µs ± 1% ~ (p=0.310 n=5+5)
+CreateGoroutinesSingle-24 406ns ± 0% 407ns ± 1% ~ (p=0.421 n=5+5)
+WakeupParallelSpinning/0s-24 14.7µs ± 1% 14.7µs ± 2% ~ (p=0.548 n=5+5)
+WakeupParallelSpinning/1µs-24 19.0µs ± 3% 18.9µs ± 3% ~ (p=1.000 n=5+5)
+WakeupParallelSpinning/2µs-24 24.4µs ± 4% 24.7µs ± 2% ~ (p=0.421 n=5+5)
+WakeupParallelSpinning/5µs-24 36.7µs ± 3% 37.0µs ± 1% ~ (p=0.548 n=5+5)
+WakeupParallelSpinning/10µs-24 54.0µs ± 0% 54.0µs ± 1% ~ (p=0.802 n=5+5)
+WakeupParallelSpinning/20µs-24 96.3µs ± 0% 96.1µs ± 0% ~ (p=0.222 n=5+5)
+WakeupParallelSpinning/50µs-24 222µs ± 0% 222µs ± 0% ~ (p=0.690 n=5+5)
+WakeupParallelSpinning/100µs-24 386µs ± 2% 391µs ± 3% ~ (p=0.310 n=5+5)
+WakeupParallelSyscall/0s-24 171µs ± 1% 170µs ± 1% ~ (p=0.095 n=5+5)
+WakeupParallelSyscall/1µs-24 173µs ± 1% 172µs ± 1% ~ (p=0.222 n=5+5)
+WakeupParallelSyscall/2µs-24 176µs ± 2% 174µs ± 1% ~ (p=0.421 n=5+5)
+WakeupParallelSyscall/5µs-24 183µs ± 1% 184µs ± 1% ~ (p=0.095 n=5+5)
+WakeupParallelSyscall/10µs-24 190µs ± 0% 193µs ± 1% +1.49% (p=0.008 n=5+5)
+WakeupParallelSyscall/20µs-24 213µs ± 1% 213µs ± 1% ~ (p=0.548 n=5+5)
+WakeupParallelSyscall/50µs-24 274µs ± 1% 275µs ± 3% ~ (p=0.690 n=5+5)
+WakeupParallelSyscall/100µs-24 377µs ± 1% 380µs ± 3% ~ (p=0.151 n=5+5)
+Matmult-24 0.97ns ± 1% 0.97ns ± 2% ~ (p=0.841 n=5+5)
+
+name old alloc/op new alloc/op delta
+CreateGoroutinesCapture-24 144B ± 0% 144B ± 0% ~ (all equal)
+
+name old allocs/op new allocs/op delta
+CreateGoroutinesCapture-24 5.00 ± 0% 5.00 ± 0% ~ (all equal)
+```
+
+The metric is also accurate (matches actual on-CPU proportions) and precise
+(repeated measurements have low variability). To verify, we used [a form
+of](https://github.com/irfansharif/runner/blob/3279169983005ccff797269df11bdc6e1897e48f/runtime_test.go#L70-L172)
+the tests proposed in [go#36821](https://github.com/golang/go/issues/36821):
+`TestEquivalentGoroutines` and `TestProportionalGoroutines`.
+
+```
+=== RUN TestEquivalentGoroutines # want ~10% for each
+ 0's got 9.98% of total time
+ 1's got 9.53% of total time
+ 2's got 9.22% of total time
+ 3's got 10.42% of total time
+ 4's got 9.84% of total time
+ 5's got 10.43% of total time
+ 6's got 10.50% of total time
+ 7's got 10.21% of total time
+ 8's got 10.03% of total time
+ 9's got 9.86% of total time
+
+=== RUN TestProportionalGoroutines # want incrementing multipliers
+ 0's got 1.87% of total time (1.000000x)
+ 1's got 3.60% of total time (1.931999x)
+ 2's got 5.41% of total time (2.899312x)
+ 3's got 7.21% of total time (3.864451x)
+ 4's got 9.11% of total time (4.880925x)
+ 5's got 10.94% of total time (5.864723x)
+ 6's got 12.77% of total time (6.842004x)
+ 7's got 14.34% of total time (7.685840x)
+ 8's got 16.58% of total time (8.885060x)
+ 9's got 18.18% of total time (9.741030x)
+```
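+
+Roughly: `TestEquivalentGoroutines` spins up N goroutines doing identical work
+and expects each to accrue ~1/N of the total measured running time, while
+`TestProportionalGoroutines` gives goroutine i a workload proportional to i+1
+and expects the measured shares to scale accordingly. A simplified sketch of
+the former (the linked prototype has the real versions; helper names here are
+assumed):
+
+```go
+package grunning
+
+import (
+	"sync"
+	"testing"
+	_ "unsafe" // for go:linkname
+)
+
+//go:linkname grunningnanos runtime.grunningnanos
+func grunningnanos() int64
+
+// spin burns a fixed amount of CPU so every goroutine does identical work.
+func spin(iters int) (acc uint64) {
+	for i := 0; i < iters; i++ {
+		acc += uint64(i)
+	}
+	return acc
+}
+
+// TestEquivalentGoroutines (simplified): 10 goroutines do the same work; each
+// should end up with ~10% of the total measured running time.
+func TestEquivalentGoroutines(t *testing.T) {
+	const n = 10
+	nanos := make([]int64, n)
+	var wg sync.WaitGroup
+	for i := 0; i < n; i++ {
+		i := i
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			_ = spin(1 << 26)
+			nanos[i] = grunningnanos()
+		}()
+	}
+	wg.Wait()
+
+	var total int64
+	for _, d := range nanos {
+		total += d
+	}
+	for i, d := range nanos {
+		t.Logf("%d's got %.2f%% of total time", i, 100*float64(d)/float64(total))
+	}
+}
+```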
+
+TODO: Evaluate the inaccuracy given that time spent descheduled by the OS is
+also counted here. The Go scheduler tries its best to hog the processors it’s
+been allotted, and given we’re running many more goroutines than processors,
+the observed inaccuracy across all goroutines (only the ones pinned to a
+descheduled OS thread are overcounted) should be vanishingly small.
+
+## Alternatives
+
+There are a few options to get CPU attribution with varying degrees of
+accuracy, granularity, and convenience. It’s possible we’ll incorporate one or
+more of the listed alternatives in the future.
+
+### Sampling profiler
+
+We can get coarse-grained, non-instantaneous CPU attribution using pprof and
+[profiler labels](https://rakyll.org/profiler-labels/). A sketch of what this
+would look like was [prototyped
+here](https://github.com/cockroachdb/cockroach/pull/60508), and works roughly
+as follows:
+- We'd make use of profiler labels at points of interest (code paths that
+  requests to a single range/tenant feed through, for example); see the sketch
+  after this list.
+- Go automatically propagates profiler labels across goroutine boundaries with
+ shared contexts. gRPC also propagates labels across RPC boundaries.
+- We'd periodically capture profiling data and use it to compute the percentages
+  of CPU time attributable to specific scopes.
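+
+A sketch of the labeling step, using the standard `runtime/pprof` API (the
+label key and helper name are placeholders):
+
+```go
+package tenantcpu
+
+import (
+	"context"
+	"runtime/pprof"
+)
+
+// withTenantLabel runs fn with a "tenant" profiler label attached, so that CPU
+// samples collected while fn (and any goroutines it spawns with the same ctx)
+// runs can later be bucketed per tenant.
+func withTenantLabel(ctx context.Context, tenantID string, fn func(context.Context)) {
+	pprof.Do(ctx, pprof.Labels("tenant", tenantID), fn)
+}
+```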
+
+Comparison:
+
+- Does not necessitate runtime changes.
+- Provides coarse-grained attribution, which is likely sufficient for things
+ like the allocator or the tenant rate limiter, but harder to apply to
+ per-statement/request accounting.
+  - Sampling rates have a maximum frequency of 100Hz, which limits fidelity.
+  - Sampling-backed signals are only accurate over a larger number of samples
+    (i.e. non-instantaneous), which is hard to use when we care about
+    capturing CPU data for executions in the tail (think statements that
+    don't take long or don't happen frequently).
+- CPU profiling with labels is allocation-inefficient and has a higher
+ performance overhead (though bounded to <1%; TODO: more numbers). Profiles
+ capture the entire stack trace; for CPU attribution we only need to identify
+ the running goroutine.
+- CPU attribution from sampling data can be inaccurate and imprecise, as
+ observed in [go#36821](https://github.com/golang/go/issues/36821); measured
+ CPU time does better as shown in the evaluation above.
+
+### Task groups
+
+Another proposal for CPU attribution is introducing two new abstractions in the
+Go runtime:
+[taskgroups](https://github.com/knz/cockroach/blob/20210215-task-group-rfc/docs/RFCS/20210215_task_groups.md#task-group-abstraction)
+and [inheritable goroutine
+IDs](https://github.com/knz/cockroach/blob/20210215-task-group-rfc/docs/RFCS/20210215_task_groups.md#task-group-abstraction).
+The proposal observes the need to accumulate timings (and other statistics, like
+memory usage) across a set of cooperating goroutines, and to be able to retrieve
+a goroutine's timings from possibly another goroutine. It also segues into
+precise tracking of memory allocations and the possibility of resource control
+within the runtime (which we don’t propose here), and helpfully outlines a few
+other options for resource attribution and control: counting CPU ticks, using
+separate processes, and manual instrumentation.
+
+Comparison:
+- Among the alternatives listed here this is the closest one in spirit,
+ differing in the specific runtime changes proposed (with library code
+ implications).
+  - Maintaining a modified runtime is logistically easier now than when the
+    proposal was authored, given our Bazel investments.
+- Task groups push the cross-goroutine tracking logic into the runtime itself,
+  which makes for a larger patch set to maintain and one less likely to be
+  upstreamed. This also has implications for reviewability. This proposal is
+  more targeted, lifting as much as possible out of the runtime and into CRDB
+  libraries.
+- Task groups let you read off counters before an "inherited" goroutine is done,
+ which could matter for very long-lived goroutines. With this proposal, if
+ we’re interested in accumulating counters for long-lived goroutines, the
+ goroutines would themselves have to retrieve their running time and maintain
+ external counters. Possible, but perhaps more awkward.
+- Task groups make use of atomics to maintain running counters as opposed to
+ uncontended per-`g` counters. This could be slower due to cache coherency
+ protocols across processors but we suspect the effects would be negligible.
+- At a high level, task groups are an opt-out form of timing tracking (an
+  "unrelated" goroutine spawned off would have to be explicitly excluded from
+  accumulating time into the parent task group). This proposal is more opt-in.
+
+### eBPF probes
+
+Linux has the ability to define
+[probes](https://www.brendangregg.com/blog/2015-06-28/linux-ftrace-uprobe.html)
+at specific callsites in user-level code (which for us would include the Go
+runtime). This document is a good primer on
+[uprobes](https://github.com/jav/systemtap/blob/master/runtime/uprobes/uprobes.txt)
+specifically. We can run eBPF programs every time the callsite is executed and
+maintain state across invocations. We could then, in theory, instantiate maps
+keyed by the goroutine ID and maintain running counters (last ran, total ran) in
+the same way we’re proposing above (`lastsched`, `runningnanos`) but without
+modifying the runtime itself. We’d probe explicit callsites, probably the same
+ones as above, where `g`s change state and where they’re created and destroyed.
+
+An unrelated example of what eBPF could look like with CRDB (probing the Go
+scheduler to figure out how often it’s invoked, how often it sleeps, and a
+distribution of how long it sleeps for, in nanoseconds):
+
+```
+sudo bpftrace -p 9593 -e '
+uprobe:/home/ubuntu/cockroach:runtime.schedule {
+ @in[tid] = 1;
+ @steal[tid] = 0;
+ @parkstart[tid] = 0;
+ @parkns[tid] = 0
+}
+uretprobe:/home/ubuntu/cockroach:runtime.runqsteal {
+ if (@in[tid] == 1 && @steal[tid] == 0 && retval != 0) {
+ @steal[tid] = 1
+ }
+}
+uprobe:/home/ubuntu/cockroach:runtime.notesleep {
+ if (@in[tid] == 1) {
+ @parkstart[tid] = nsecs
+ }
+}
+uretprobe:/home/ubuntu/cockroach:runtime.notesleep {
+ if (@in[tid] == 1) {
+ @parkns[tid] += (nsecs - @parkstart[tid]);
+ @parkstart[tid] = 0
+ }
+}
+uprobe:/home/ubuntu/cockroach:runtime.execute {
+ if (@in[tid] == 1) {
+ if (@steal[tid] == 1) { @steals += 1 }
+ if (@parkns[tid] > 1) {
+ @parks += 1;
+ @park_dur = hist(@parkns[tid]);
+ }
+ @schedules += 1;
+ @in[tid] = 0;
+ }
+}
+interval:s:1 {
+ printf("schedules/s %d, steals/s %d, parks/s %d \n", @schedules, @steals, @parks);
+ print(@park_dur);
+ clear(@steals);
+ clear(@schedules);
+ clear(@parks);
+ clear(@park_dur);
+}'
+```
+
+```
+schedules/s 3796, steals/s 18, parks/s 3450
+@park_dur:
+[4K, 8K) 4 | |
+[8K, 16K) 0 | |
+[16K, 32K) 4 | |
+[32K, 64K) 27 |@ |
+[64K, 128K) 98 |@@@@@ |
+[128K, 256K) 304 |@@@@@@@@@@@@@@@@ |
+[256K, 512K) 975 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
+[512K, 1M) 590 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ |
+[1M, 2M) 984 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
+[2M, 4M) 351 |@@@@@@@@@@@@@@@@@@ |
+[4M, 8M) 50 |@@ |
+[8M, 16M) 25 |@ |
+[16M, 32M) 27 |@ |
+[32M, 64M) 11 | |
+[64M, 128M) 4 |
+```
+
+Comparison:
+
+- Linux-only; processes accessing eBPF probes also need to run as root.
+- Does not necessitate runtime changes but does effectively need an
+ understanding of runtime internals.
+- Can be used to probe OS scheduling events, something the Go runtime itself
+ has no visibility into.
+- `uretprobes` don’t work in Go (see
+ [bcc#1320](https://github.com/iovisor/bcc/issues/1320) and
+  [go#22008](https://github.com/golang/go/issues/22008)) because of the stack
+  rewriting the probes do to trampoline off to eBPF programs, something the Go
+ runtime does not expect when growing/shrinking stacks. This presents as
+ panics (TODO: memory corruption claims).
+ - There are ways to
+ [simulate](https://github.com/iovisor/bcc/issues/1320#issuecomment-407927542)
+ `uretprobes` through `uprobes` but it’s more annoying to do; we’d need to
+ decode architecture-specific instructions and ensure coverage for all
+ kinds of return instructions on each architecture.
+- eBPF probes have more overhead than tracking running nanos in the runtime
+  itself (instrumented function calls take on the order of µs as opposed to
+ ns). Also, we’d still have the vDSO call in our eBPF program.
+- Recent
+ [developments](https://docs.px.dev/tutorials/custom-data/dynamic-go-logging/)
+ in the eBPF + Go ecosystem might make things progressively less painful
+ (libraries to read arguments from the Go stack, decoding Go structs).
+ - The out-of-the-box `bpftrace` use of [thread
+ IDs](https://www.brendangregg.com/BPF/bpftrace-cheat-sheet.html) (`tid`
+    in the snippet above) doesn’t apply to Go; we’d need glue work to
+ [access](https://github.com/surki/misc/blob/c343525e35a96497dd356c38921f25b22c77fcc9/go.stp#L5-L21)
+ goroutine IDs.
+
+### Admission control slot times
+
+CPU-bound work (e.g. KV-level request and SQL statement processing) gets
+queued through per-node [admission
+queues](https://docs.google.com/document/d/1x4DxbOjwCK-zrfO0amFNfHUI5Wul6V_IdIG7QKC5yqQ/edit#heading=h.mal9ypa8z9pq)
+to provide overload protection and fairness. Queued work proceeds once granted
+a slot, and we have visibility into when work for a given slot starts and ends.
+These wall time readings could serve as a proxy for CPU processing the work
+entailed; timings we could then use for per-store attribution to specific
+ranges or tenants. The
+[inaccuracies](https://github.com/cockroachdb/cockroach/issues/75066#issuecomment-1023407169)
+here stem from wait times in:
+
+- the Go scheduler after work is slotted (though this is bounded by admission
+ control, and affects all range/tenant-scoped work uniformly);
+- txn contention handling (something we can instrument and subtract by timing
+ things within our concurrency control libraries);
+- I/O wait times.
+
+Considerations:
+- Does not necessitate runtime changes.
+- A proxy for actual CPU time, but perhaps good enough for the use cases where
+  we imagine needing accurate/precise CPU modeling.
+- Usable for work that makes sense to enqueue through admission control
+ (unclear if it applies to all subsystems where CPU attribution could be
+ used).
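+
+For illustration, a minimal sketch of the wall-time proxy (every name below is
+invented; the real integration would live alongside the admission work queues):
+
+```go
+package admissionattr
+
+import "time"
+
+// grantedSlot records the wall time between a slot being granted and the work
+// finishing, charged to the originating tenant as a proxy for CPU time. Go
+// scheduler wait, txn contention, and I/O wait are all included, which is
+// where the inaccuracies listed above come from.
+type grantedSlot struct {
+	tenantID uint64
+	start    time.Time
+}
+
+func onSlotGranted(tenantID uint64) grantedSlot {
+	return grantedSlot{tenantID: tenantID, start: time.Now()}
+}
+
+func (s grantedSlot) onWorkDone(nanosByTenant map[uint64]int64) {
+	nanosByTenant[s.tenantID] += time.Since(s.start).Nanoseconds()
+}
+```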
+
+---
+
+[disk-util]: Unlike CPU, modeling capacity utilization for disks feels
+ generally difficult. This is doubly so in virtualized environments where it's
+ less clear how the I/O queue depths relate to the actual underlying disks, or
+ how the disk bandwidth/IOPS observable in the VM through OS counters relate
+  to what the VMs are provisioned for. We have, and want to continue developing,
+  models of disk utilization for other reasons (admission control, dynamic
+  snapshot rates, allocation, capacity-aware Pebble compactions, reducing the
+  foreground impact of background IO), independent of the CPU modeling this
+  document proposes.
+
+[release-go]: The official Go archives are built using the steps
+ [here](https://go.googlesource.com/build/+/refs/heads/master/cmd/release/release.go).
diff --git a/docs/RFCS/20220602_fine_grained_cpu_attribution/index-nanos.png b/docs/RFCS/20220602_fine_grained_cpu_attribution/index-nanos.png
new file mode 100644
index 000000000000..73788653b330
Binary files /dev/null and b/docs/RFCS/20220602_fine_grained_cpu_attribution/index-nanos.png differ
diff --git a/docs/RFCS/20220602_fine_grained_cpu_attribution/pprof.png b/docs/RFCS/20220602_fine_grained_cpu_attribution/pprof.png
new file mode 100644
index 000000000000..101cfcf6ee18
Binary files /dev/null and b/docs/RFCS/20220602_fine_grained_cpu_attribution/pprof.png differ
diff --git a/docs/RFCS/20220602_fine_grained_cpu_attribution/vtables.png b/docs/RFCS/20220602_fine_grained_cpu_attribution/vtables.png
new file mode 100644
index 000000000000..2faad446a80b
Binary files /dev/null and b/docs/RFCS/20220602_fine_grained_cpu_attribution/vtables.png differ
diff --git a/pkg/geo/geos/geos.go b/pkg/geo/geos/geos.go
index 2ba6bac0240a..03761d8e3682 100644
--- a/pkg/geo/geos/geos.go
+++ b/pkg/geo/geos/geos.go
@@ -227,10 +227,13 @@ func wrapGEOSInitError(err error) error {
case "windows":
page = "windows"
}
- return errors.WithHintf(
- err,
- "Ensure you have the spatial libraries installed as per the instructions in %s",
- docs.URL("install-cockroachdb-"+page),
+ return pgerror.WithCandidateCode(
+ errors.WithHintf(
+ err,
+ "Ensure you have the spatial libraries installed as per the instructions in %s",
+ docs.URL("install-cockroachdb-"+page),
+ ),
+ pgcode.ConfigFile,
)
}