From 007f739833a2e198f228772ccc164de13842b4f7 Mon Sep 17 00:00:00 2001 From: Aditya Maru Date: Mon, 27 Jun 2022 09:07:05 -0400 Subject: [PATCH 1/5] tracing: remove deprecated log record field This change deletes the deprecated field in `LogRecord` that was kept around for configurability with 21.2 nodes. This field was duplicative in the string and JSON output and since we will only support running a mixed version with 22.1 nodes, it is safe to remove. Release note: None --- pkg/server/node_tenant.go | 21 ------------------- pkg/util/tracing/crdbspan.go | 4 ---- pkg/util/tracing/tracingpb/recorded_span.go | 11 ---------- .../tracing/tracingpb/recorded_span.proto | 6 ++---- 4 files changed, 2 insertions(+), 40 deletions(-) diff --git a/pkg/server/node_tenant.go b/pkg/server/node_tenant.go index 139e1c565e52..0f6a7760c004 100644 --- a/pkg/server/node_tenant.go +++ b/pkg/server/node_tenant.go @@ -43,27 +43,6 @@ func redactRecordingForTenant(tenID roachpb.TenantID, rec tracingpb.Recording) e "recording has non-redactable span with the Message field set: %s", sp) } record.Message = record.Message.Redact() - - // For compatibility with old versions, also redact DeprecatedFields. - for k := range record.DeprecatedFields { - field := &record.DeprecatedFields[k] - if field.Key != tracingpb.LogMessageField { - // We don't have any of these fields, but let's not take any - // chances (our dependencies might slip them in). - field.Value = TraceRedactedMarker - continue - } - if !sp.RedactableLogs { - // If we're handling a span that originated from an (early patch - // release) 22.1 node, all the containing information will be - // stripped. Note that this is not the common path here, as most - // information in the trace will be from the local node, which - // always creates redactable logs. - field.Value = TraceRedactedMarker - continue - } - field.Value = field.Value.Redact() - } } } return nil diff --git a/pkg/util/tracing/crdbspan.go b/pkg/util/tracing/crdbspan.go index a4d810a4fe50..425c8678fb4b 100644 --- a/pkg/util/tracing/crdbspan.go +++ b/pkg/util/tracing/crdbspan.go @@ -678,10 +678,6 @@ func (s *crdbSpan) record(msg redact.RedactableString) { logRecord := &tracingpb.LogRecord{ Time: now, Message: msg, - // Compatibility with 21.2. - DeprecatedFields: []tracingpb.LogRecord_Field{ - {Key: tracingpb.LogMessageField, Value: msg}, - }, } s.recordInternal(logRecord, &s.mu.recording.logs) diff --git a/pkg/util/tracing/tracingpb/recorded_span.go b/pkg/util/tracing/tracingpb/recorded_span.go index 1c43d0ac86a4..cacd329024b5 100644 --- a/pkg/util/tracing/tracingpb/recorded_span.go +++ b/pkg/util/tracing/tracingpb/recorded_span.go @@ -82,17 +82,6 @@ func (l LogRecord) Msg() redact.RedactableString { if l.Message != "" { return l.Message } - - // Compatibility with 21.2: look at l.DeprecatedFields. - for _, f := range l.DeprecatedFields { - key := f.Key - if key == LogMessageField { - return f.Value - } - if key == "error" { - return redact.Sprintf("error: %s", f.Value) - } - } return "" } diff --git a/pkg/util/tracing/tracingpb/recorded_span.proto b/pkg/util/tracing/tracingpb/recorded_span.proto index 86bd64d34f13..3ec582670cfe 100644 --- a/pkg/util/tracing/tracingpb/recorded_span.proto +++ b/pkg/util/tracing/tracingpb/recorded_span.proto @@ -27,12 +27,10 @@ message LogRecord { string key = 1; string value = 2 [(gogoproto.nullable) = false, (gogoproto.customtype) = "github.com/cockroachdb/redact.RedactableString"]; } - // Fields with values converted to strings. In 22.1, the `message` field - // contains the log message, and this field is only used for compatibility - // with 21.2 nodes. - repeated Field deprecated_fields = 2 [(gogoproto.nullable) = false]; // The log message. string message = 3 [(gogoproto.nullable) = false, (gogoproto.customtype) = "github.com/cockroachdb/redact.RedactableString"]; + + reserved 2; } // StructuredRecord is a structured message recorded in a traced span. From ab6d45e57f3a658f108f6c6e5871bdc08b543ce8 Mon Sep 17 00:00:00 2001 From: Aayush Shah Date: Tue, 29 Mar 2022 13:56:02 -0400 Subject: [PATCH 2/5] kvserver: check GC threshold after acquiring a storage snapshot Previously, we would check if a given read could proceed with execution _before_ we acquired a snapshot of the storage engine state. In particular, we would check the replica's in-memory GC threshold before the state of the engine had been pinned. This meant that the following scenario was possible: 1. Request comes in, checks the replica's in-memory GC threshold and determines that it is safe to proceed. 2. A pair of GC requests bump the GC threshold and garbage-collect the expired data. 3. The read acquires a snapshot of the storage engine state. 4. The read continues with its execution and sees an incorrect result. This commit makes it such that we now check the GC threshold after we acquire a snapshot of the storage engine state. It does so by lifting the GC threshold check out of `checkExecutionCanProceed()` and splitting up the single critical section under `Replica.mu` into two. NB: Note that this commit only fixes our current model of issuing `GCRequest`s -- which is that we first issue a GCRequest that simply bumps the GC threshold and then issue another GCRequest that actually performs the garbage collection. If a single GCRequest were to do both of these things, we'd still have an issue with reads potentially seeing incorrect results since, currently, the in-memory GC threshold is bumped as a "post-apply" side effect that takes effect after the expired data has already been garbage collected. This will be handled in a future patch. Release note: none --- pkg/kv/kvserver/replica.go | 68 ++++++++++++++----- pkg/kv/kvserver/replica_follower_read_test.go | 2 +- pkg/kv/kvserver/replica_raft.go | 3 +- pkg/kv/kvserver/replica_read.go | 5 +- pkg/kv/kvserver/replica_send.go | 2 +- pkg/kv/kvserver/replica_write.go | 7 +- pkg/kv/kvserver/store_test.go | 2 +- 7 files changed, 65 insertions(+), 24 deletions(-) diff --git a/pkg/kv/kvserver/replica.go b/pkg/kv/kvserver/replica.go index f40c69bb8d79..c938fbcbd966 100644 --- a/pkg/kv/kvserver/replica.go +++ b/pkg/kv/kvserver/replica.go @@ -1345,10 +1345,12 @@ func (r *Replica) assertStateRaftMuLockedReplicaMuRLocked( // TODO(nvanbenschoten): move the following 5 methods to replica_send.go. -// checkExecutionCanProceed returns an error if a batch request cannot be -// executed by the Replica. An error indicates that the Replica is not live and -// able to serve traffic or that the request is not compatible with the state of -// the Range due to the range's key bounds, the range's lease, the range's GC +// checkExecutionCanProceedBeforeStorageSnapshot returns an error if a batch +// request cannot be executed by the Replica. For read-only requests, the method +// is called before the state of the storage engine is pinned (via an iterator +// or a snapshot). An error indicates that the Replica is not live and able to +// serve traffic or that the request is not compatible with the state of the +// Range due to the range's key bounds, the range's lease, the range's GC // threshold, or due to a pending merge. On success, returns nil and either a // zero LeaseStatus (indicating that the request was permitted to skip the lease // checks) or a LeaseStatus in LeaseState_VALID (indicating that the Replica is @@ -1359,7 +1361,7 @@ func (r *Replica) assertStateRaftMuLockedReplicaMuRLocked( // will not wait for a pending merge to conclude before proceeding. Callers // might be ok with this if they know that they will end up checking for a // pending merge at some later time. -func (r *Replica) checkExecutionCanProceed( +func (r *Replica) checkExecutionCanProceedBeforeStorageSnapshot( ctx context.Context, ba *roachpb.BatchRequest, g *concurrency.Guard, ) (kvserverpb.LeaseStatus, error) { rSpan, err := keys.Range(ba.Requests) @@ -1398,7 +1400,7 @@ func (r *Replica) checkExecutionCanProceed( return kvserverpb.LeaseStatus{}, err } - st, shouldExtend, err := r.checkGCThresholdAndLeaseRLocked(ctx, ba) + st, shouldExtend, err := r.checkLeaseRLocked(ctx, ba) if err != nil { return kvserverpb.LeaseStatus{}, err } @@ -1432,13 +1434,54 @@ func (r *Replica) checkExecutionCanProceed( return st, nil } -// checkGCThresholdAndLeaseRLocked checks the provided batch against the GC +// checkExecutionCanProceedAfterStorageSnapshot returns an error if a batch +// request cannot be executed by the Replica. For read-only requests, this +// method is called after the state of the storage engine is pinned via an +// iterator. An error indicates that the request's timestamp is below the +// Replica's GC threshold. +func (r *Replica) checkExecutionCanProceedAfterStorageSnapshot( + ba *roachpb.BatchRequest, st kvserverpb.LeaseStatus, +) error { + r.mu.RLock() + defer r.mu.RUnlock() + // NB: For read-only requests, the GC threshold check is performed after the + // state of the storage engine has been pinned by the iterator. This is + // because GC requests don't acquire latches at the timestamp they are garbage + // collecting, so read traffic at / around the GC threshold will not be + // serialized with GC requests. Thus, we must check the in-memory GC threshold + // after we pin the state of the storage engine [1]. + // + // [1]: This relies on the invariant that the in-memory GC threshold is bumped + // _before_ the actual garbage collection happens. + // + // TODO(aayush): The above description intentionally omits some details, as + // they are going to be changed as part of + // https://github.com/cockroachdb/cockroach/issues/55293. + return r.checkTSAboveGCThresholdRLocked(ba.EarliestActiveTimestamp(), st, ba.IsAdmin()) +} + +// checkExecutionCanProceedRWOrAdmin returns an error if a batch request going +// through the RW or admin paths cannot be executed by the Replica. +func (r *Replica) checkExecutionCanProceedRWOrAdmin( + ctx context.Context, ba *roachpb.BatchRequest, g *concurrency.Guard, +) (kvserverpb.LeaseStatus, error) { + st, err := r.checkExecutionCanProceedBeforeStorageSnapshot(ctx, ba, g) + if err != nil { + return kvserverpb.LeaseStatus{}, err + } + if err := r.checkExecutionCanProceedAfterStorageSnapshot(ba, st); err != nil { + return kvserverpb.LeaseStatus{}, err + } + return st, nil +} + +// checkLeaseRLocked checks the provided batch against the GC // threshold and lease. A nil error indicates to go ahead with the batch, and // is accompanied either by a valid or zero lease status, the latter case // indicating that the request was permitted to bypass the lease check. The // returned bool indicates whether the lease should be extended (only on nil // error). -func (r *Replica) checkGCThresholdAndLeaseRLocked( +func (r *Replica) checkLeaseRLocked( ctx context.Context, ba *roachpb.BatchRequest, ) (kvserverpb.LeaseStatus, bool, error) { now := r.Clock().NowAsClockTimestamp() @@ -1474,7 +1517,7 @@ func (r *Replica) checkGCThresholdAndLeaseRLocked( } // Otherwise, suppress the error. Also, remember that we're not serving // this under the lease by zeroing out the status. We also intentionally - // do not pass the original status to checkTSAboveGCThresholdRLocked as + // do not pass the original status to checkTSAboveGCThreshold as // this method assumes that a valid status indicates that this replica // holds the lease (see #73123). `shouldExtend` is already false in this // branch, but for completeness we zero it out as well. @@ -1482,13 +1525,6 @@ func (r *Replica) checkGCThresholdAndLeaseRLocked( } } - // Check if request is below the GC threshold and if so, error out. Note that - // this uses the lease status no matter whether it's valid or not, and the - // method is set up to handle that. - if err := r.checkTSAboveGCThresholdRLocked(ba.EarliestActiveTimestamp(), st, ba.IsAdmin()); err != nil { - return kvserverpb.LeaseStatus{}, false, err - } - return st, shouldExtend, nil } diff --git a/pkg/kv/kvserver/replica_follower_read_test.go b/pkg/kv/kvserver/replica_follower_read_test.go index faebb2b29e82..a13d0c08e621 100644 --- a/pkg/kv/kvserver/replica_follower_read_test.go +++ b/pkg/kv/kvserver/replica_follower_read_test.go @@ -174,7 +174,7 @@ func TestCheckExecutionCanProceedAllowsFollowerReadWithInvalidLease(t *testing.T } ba.Add(&gArgs) - ls, err := r.checkExecutionCanProceed(ctx, ba, nil /* g */) + ls, err := r.checkExecutionCanProceedBeforeStorageSnapshot(ctx, ba, nil /* g */) require.NoError(t, err) require.Empty(t, ls) diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go index 09bfa7a587da..c28ef4fbb44d 100644 --- a/pkg/kv/kvserver/replica_raft.go +++ b/pkg/kv/kvserver/replica_raft.go @@ -215,7 +215,8 @@ func (r *Replica) evalAndPropose( proposal.command.ProposerLeaseSequence = seq } else if !st.Lease.OwnedBy(r.store.StoreID()) { // Perform a sanity check that the lease is owned by this replica. This must - // have been ascertained by the callers in checkExecutionCanProceed. + // have been ascertained by the callers in + // checkExecutionCanProceedBeforeStorageSnapshot. log.Fatalf(ctx, "cannot propose %s on follower with remotely owned lease %s", ba, st.Lease) } else { proposal.command.ProposerLeaseSequence = st.Lease.Sequence diff --git a/pkg/kv/kvserver/replica_read.go b/pkg/kv/kvserver/replica_read.go index 7df6a5c6ac8e..fc8e65cd0688 100644 --- a/pkg/kv/kvserver/replica_read.go +++ b/pkg/kv/kvserver/replica_read.go @@ -41,7 +41,7 @@ func (r *Replica) executeReadOnlyBatch( defer r.readOnlyCmdMu.RUnlock() // Verify that the batch can be executed. - st, err := r.checkExecutionCanProceed(ctx, ba, g) + st, err := r.checkExecutionCanProceedBeforeStorageSnapshot(ctx, ba, g) if err != nil { return nil, g, roachpb.NewError(err) } @@ -74,6 +74,9 @@ func (r *Replica) executeReadOnlyBatch( } defer rw.Close() + if err := r.checkExecutionCanProceedAfterStorageSnapshot(ba, st); err != nil { + return nil, g, roachpb.NewError(err) + } // TODO(nvanbenschoten): once all replicated intents are pulled into the // concurrency manager's lock-table, we can be sure that if we reached this // point, we will not conflict with any of them during evaluation. This in diff --git a/pkg/kv/kvserver/replica_send.go b/pkg/kv/kvserver/replica_send.go index 4ac84feccb7b..e50115bf1ae2 100644 --- a/pkg/kv/kvserver/replica_send.go +++ b/pkg/kv/kvserver/replica_send.go @@ -891,7 +891,7 @@ func (r *Replica) executeAdminBatch( return nil, roachpb.NewError(err) } - _, err := r.checkExecutionCanProceed(ctx, ba, nil /* g */) + _, err := r.checkExecutionCanProceedRWOrAdmin(ctx, ba, nil /* g */) if err == nil { err = r.signallerForBatch(ba).Err() } diff --git a/pkg/kv/kvserver/replica_write.go b/pkg/kv/kvserver/replica_write.go index 6de60bf406b3..7a3333541134 100644 --- a/pkg/kv/kvserver/replica_write.go +++ b/pkg/kv/kvserver/replica_write.go @@ -98,13 +98,14 @@ func (r *Replica) executeWriteBatch( }() // Verify that the batch can be executed. - st, err := r.checkExecutionCanProceed(ctx, ba, g) + st, err := r.checkExecutionCanProceedRWOrAdmin(ctx, ba, g) if err != nil { return nil, g, roachpb.NewError(err) } - // Check the breaker. Note that we do this after checkExecutionCanProceed, - // so that NotLeaseholderError has precedence. + // Check the breaker. Note that we do this after + // checkExecutionCanProceedBeforeStorageSnapshot, so that NotLeaseholderError + // has precedence. if err := r.signallerForBatch(ba).Err(); err != nil { return nil, g, roachpb.NewError(err) } diff --git a/pkg/kv/kvserver/store_test.go b/pkg/kv/kvserver/store_test.go index 1671b0bd0115..72a68e9cf3ca 100644 --- a/pkg/kv/kvserver/store_test.go +++ b/pkg/kv/kvserver/store_test.go @@ -674,7 +674,7 @@ func TestStoreRemoveReplicaDestroy(t *testing.T) { t.Fatal("replica was not marked as destroyed") } - if _, err = repl1.checkExecutionCanProceed(ctx, &roachpb.BatchRequest{}, nil /* g */); !errors.Is(err, expErr) { + if _, err = repl1.checkExecutionCanProceedBeforeStorageSnapshot(ctx, &roachpb.BatchRequest{}, nil /* g */); !errors.Is(err, expErr) { t.Fatalf("expected error %s, but got %v", expErr, err) } } From efa68b9ee60eb4dec1cf063a8de142030314465e Mon Sep 17 00:00:00 2001 From: Aayush Shah Date: Mon, 28 Mar 2022 17:17:19 -0400 Subject: [PATCH 3/5] kvserver: unskip the `thresholdFirst=true` variants of `TestGCThresholdRacesWithRead` This commit unskips a subset of `TestGCThresholdRacesWithRead`, which is now possible because of #76312 and the first commit in this patch. See https://github.com/cockroachdb/cockroach/pull/76410#issuecomment-104101938 Relates to https://github.com/cockroachdb/cockroach/issues/55293. Release note: none --- pkg/kv/kvserver/replica_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/kv/kvserver/replica_test.go b/pkg/kv/kvserver/replica_test.go index a6a0063110b4..510c21fa5634 100644 --- a/pkg/kv/kvserver/replica_test.go +++ b/pkg/kv/kvserver/replica_test.go @@ -8572,7 +8572,6 @@ func TestRefreshFromBelowGCThreshold(t *testing.T) { func TestGCThresholdRacesWithRead(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - skip.WithIssue(t, 55293) testutils.RunTrueAndFalse(t, "followerRead", func(t *testing.T, followerRead bool) { testutils.RunTrueAndFalse(t, "thresholdFirst", func(t *testing.T, thresholdFirst bool) { From 500b57a8ef7fbaad74f129a95a7182c22d0c99df Mon Sep 17 00:00:00 2001 From: Aditya Maru Date: Sat, 25 Jun 2022 08:24:41 -0400 Subject: [PATCH 4/5] tracing: add ChildrenMetadata to String and JSON traces This change teaches the Recording stringer about the ChildrenMetadata map, that contains per operation metadata about the spans' children. This change also adds ChildrenMetadata to the JSON view of a recording so that it shows up in the `trace.json` file that is included in a statement bundle. Informs: #80391 Release note: a trace recording will now output an additional field per tracing span that corresponds to metadata bucketed by operation name of the spans' children. --- pkg/util/tracing/span_test.go | 8 +++- pkg/util/tracing/test_utils.go | 41 +++++++++++++++++-- pkg/util/tracing/tracingpb/BUILD.bazel | 1 + pkg/util/tracing/tracingpb/recorded_span.go | 16 ++++++++ .../tracing/tracingpb/recorded_span.proto | 3 ++ pkg/util/tracing/tracingpb/recording.go | 23 +++++++++++ pkg/util/tracing/utils.go | 1 + 7 files changed, 88 insertions(+), 5 deletions(-) diff --git a/pkg/util/tracing/span_test.go b/pkg/util/tracing/span_test.go index 6f9755e3bde5..a607ba6d6d22 100644 --- a/pkg/util/tracing/span_test.go +++ b/pkg/util/tracing/span_test.go @@ -105,6 +105,8 @@ func TestRecordingString(t *testing.T) { require.NoError(t, CheckRecording(rec, ` === operation:root _verbose:1 + [remote child] + [local child] event:root 1 === operation:remote child _verbose:1 event:remote child 1 @@ -124,7 +126,7 @@ func TestRecordingString(t *testing.T) { timeSincePrev: "0.000ms", text: "=== operation:root _verbose:1", }, l) - l, err = parseLine(lines[1]) + l, err = parseLine(lines[3]) require.Equal(t, traceLine{ timeSinceTraceStart: "1.000ms", timeSincePrev: "1.000ms", @@ -183,6 +185,7 @@ func TestRecordingInRecording(t *testing.T) { require.NoError(t, CheckRecording(childRec, ` === operation:child _verbose:1 + [grandchild] === operation:grandchild _verbose:1 `)) } @@ -208,6 +211,7 @@ func TestImportRemoteRecording(t *testing.T) { if verbose { require.NoError(t, CheckRecording(sp.FinishAndGetRecording(tracingpb.RecordingVerbose), ` === operation:root _verbose:1 + [child] === operation:child _verbose:1 event:&Int32Value{Value:4,XXX_unrecognized:[],} event:foo @@ -216,6 +220,7 @@ func TestImportRemoteRecording(t *testing.T) { } else { require.NoError(t, CheckRecording(sp.FinishAndGetRecording(tracingpb.RecordingStructured), ` === operation:root + [child] structured:{"@type":"type.googleapis.com/google.protobuf.Int32Value","value":4} `)) } @@ -844,6 +849,7 @@ func TestOpenChildIncludedRecording(t *testing.T) { rec := parent.FinishAndGetRecording(tracingpb.RecordingVerbose) require.NoError(t, CheckRecording(rec, ` === operation:parent _verbose:1 + [child] === operation:child _unfinished:1 _verbose:1 `)) child.Finish() diff --git a/pkg/util/tracing/test_utils.go b/pkg/util/tracing/test_utils.go index f65f9045e51a..66f11c9be411 100644 --- a/pkg/util/tracing/test_utils.go +++ b/pkg/util/tracing/test_utils.go @@ -15,6 +15,7 @@ import ( "regexp" "sort" "strings" + "time" "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb" "github.com/cockroachdb/errors" @@ -165,6 +166,7 @@ func CheckRecordedSpans(rec tracingpb.Recording, expected string) error { // // if err := CheckRecording(sp.GetRecording(), ` // === operation:root +// [childrenMetadata] // event:root 1 // === operation:remote child // event:remote child 1 @@ -186,19 +188,50 @@ func CheckRecording(rec tracingpb.Recording, expected string) error { // After | "event:root 1" re := regexp.MustCompile(`.*s.*s\s{4}`) rec = string(re.ReplaceAll([]byte(rec), nil)) - // 4. Change all tabs to four spaces. + // 4. Strip out all the metadata from each ChildrenMetadata entry. + // + // Before | [operation: {Count:, Duration:}] + // After | [operation] + re = regexp.MustCompile(`:.*]`) + rec = string(re.ReplaceAll([]byte(rec), []byte("]"))) + // 5. Change all tabs to four spaces. rec = strings.ReplaceAll(rec, "\t", " ") - // 5. Compute the outermost indentation. + // 6. Compute the outermost indentation. indent := strings.Repeat(" ", len(rec)-len(strings.TrimLeft(rec, " "))) - // 6. Outdent each line by that amount. + // 7. Outdent each line by that amount. var lines []string for _, line := range strings.Split(rec, "\n") { lines = append(lines, strings.TrimPrefix(line, indent)) } - // 6. Stitch everything together. + // 8. Stitch everything together. return strings.Join(lines, "\n") } + sortChildrenMetadataByName := func(m map[string]tracingpb.OperationMetadata) { + // Sort the OperationMetadata of s' children alphabetically. + childrenMetadata := make([]tracingpb.OperationAndMetadata, 0, len(m)) + for operation, metadata := range m { + childrenMetadata = append(childrenMetadata, + tracingpb.OperationAndMetadata{Operation: operation, Metadata: metadata}) + } + sort.Slice(childrenMetadata, func(i, j int) bool { + return childrenMetadata[i].Operation < childrenMetadata[j].Operation + }) + + for i, cm := range childrenMetadata { + metadata := m[cm.Operation] + metadata.Duration = time.Duration(float64(i) * time.Second.Seconds()) + m[cm.Operation] = metadata + } + } + + // ChildrenMetadata are sorted in descending order of duration when returned. + // To ensure a stable sort in tests, we set the durations to sort in an + // alphabetical descending order. + for i := range rec { + sortChildrenMetadataByName(rec[i].ChildrenMetadata) + } + exp := normalize(expected) got := normalize(rec.String()) if got != exp { diff --git a/pkg/util/tracing/tracingpb/BUILD.bazel b/pkg/util/tracing/tracingpb/BUILD.bazel index 9e73a6eb3ef4..82051fb008e7 100644 --- a/pkg/util/tracing/tracingpb/BUILD.bazel +++ b/pkg/util/tracing/tracingpb/BUILD.bazel @@ -13,6 +13,7 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb", visibility = ["//visibility:public"], deps = [ + "//pkg/util/humanizeutil", "//pkg/util/protoutil", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_redact//:redact", diff --git a/pkg/util/tracing/tracingpb/recorded_span.go b/pkg/util/tracing/tracingpb/recorded_span.go index cacd329024b5..8613e7e8e009 100644 --- a/pkg/util/tracing/tracingpb/recorded_span.go +++ b/pkg/util/tracing/tracingpb/recorded_span.go @@ -15,6 +15,7 @@ import ( "strings" "time" + "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" "github.com/cockroachdb/redact" types "github.com/gogo/protobuf/types" ) @@ -117,3 +118,18 @@ func (m OperationMetadata) Combine(other OperationMetadata) OperationMetadata { m.Duration += other.Duration return m } + +var _ redact.SafeFormatter = OperationMetadata{} + +func (m OperationMetadata) String() string { + return redact.StringWithoutMarkers(m) +} + +// SafeFormat implements redact.SafeFormatter. +func (m OperationMetadata) SafeFormat(s redact.SafePrinter, _ rune) { + s.Printf("{count: %d, duration %s", m.Count, humanizeutil.Duration(m.Duration)) + if m.ContainsUnfinished { + s.Printf(", unfinished") + } + s.Print("}") +} diff --git a/pkg/util/tracing/tracingpb/recorded_span.proto b/pkg/util/tracing/tracingpb/recorded_span.proto index 3ec582670cfe..1cd832c3cf59 100644 --- a/pkg/util/tracing/tracingpb/recorded_span.proto +++ b/pkg/util/tracing/tracingpb/recorded_span.proto @@ -44,6 +44,8 @@ message StructuredRecord { // OperationMetadata captures information corresponding to the operation that // a span is started with. message OperationMetadata { + option (gogoproto.goproto_stringer) = false; + // Duration represents the total time spent by spans tracing the operation. int64 duration = 1 [(gogoproto.casttype) = "time.Duration"]; // Count represents the number of spans tracing the operation. @@ -177,5 +179,6 @@ message NormalizedSpan { (gogoproto.stdduration) = true]; repeated LogRecord logs = 5 [(gogoproto.nullable) = false]; repeated StructuredRecord structured_records = 7 [(gogoproto.nullable) = false]; + map children_metadata = 9 [(gogoproto.nullable) = false]; repeated NormalizedSpan children = 6 [(gogoproto.nullable) = false]; } diff --git a/pkg/util/tracing/tracingpb/recording.go b/pkg/util/tracing/tracingpb/recording.go index f16e3ea61df7..ab713003fe08 100644 --- a/pkg/util/tracing/tracingpb/recording.go +++ b/pkg/util/tracing/tracingpb/recording.go @@ -14,6 +14,7 @@ import ( "encoding/json" "fmt" "regexp" + "sort" "strconv" "strings" "time" @@ -225,6 +226,13 @@ func (r Recording) FindSpan(operation string) (RecordedSpan, bool) { return RecordedSpan{}, false } +// OperationAndMetadata contains information about a tracing span operation and +// its corresponding metadata. +type OperationAndMetadata struct { + Operation string + Metadata OperationMetadata +} + // visitSpan returns the log messages for sp, and all of sp's children. // // All messages from a Span are kept together. Sibling spans are ordered within @@ -271,6 +279,21 @@ func (r Recording) visitSpan(sp RecordedSpan, depth int) []traceLogData { // ref - this entries timeSincePrev will be computed when we merge it into the parent time.Time{})) + // Sort the OperationMetadata of s' children in descending order of duration. + childrenMetadata := make([]OperationAndMetadata, 0, len(sp.ChildrenMetadata)) + for operation, metadata := range sp.ChildrenMetadata { + childrenMetadata = append(childrenMetadata, + OperationAndMetadata{operation, metadata}) + } + sort.Slice(childrenMetadata, func(i, j int) bool { + return childrenMetadata[i].Metadata.Duration > childrenMetadata[j].Metadata.Duration + }) + for _, c := range childrenMetadata { + var sb redact.StringBuilder + sb.Printf("[%s: %s]", redact.SafeString(c.Operation), c.Metadata.String()) + ownLogs = append(ownLogs, conv(sb.RedactableString(), sp.StartTime, time.Time{})) + } + for _, l := range sp.Logs { lastLog := ownLogs[len(ownLogs)-1] var sb redact.StringBuilder diff --git a/pkg/util/tracing/utils.go b/pkg/util/tracing/utils.go index 20a31bf2de5e..ca6f02ac32e3 100644 --- a/pkg/util/tracing/utils.go +++ b/pkg/util/tracing/utils.go @@ -40,6 +40,7 @@ func normalizeSpan(s tracingpb.RecordedSpan, trace tracingpb.Recording) tracingp n.TagGroups = s.TagGroups n.Logs = s.Logs n.StructuredRecords = s.StructuredRecords + n.ChildrenMetadata = s.ChildrenMetadata for _, ss := range trace { if ss.ParentSpanID != s.SpanID { From 240870ad270eebed8da698bdbfabf61a13f655c2 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Thu, 30 Jun 2022 13:50:38 -0400 Subject: [PATCH 5/5] build: allow overriding TESTS for roachtest CI job This allows us to invoke the roachtest CI jobs with an overridden TESTS variable. Release note: None --- build/teamcity/util/roachtest_util.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/teamcity/util/roachtest_util.sh b/build/teamcity/util/roachtest_util.sh index f6dab8f6b59e..5b71504c3a2c 100644 --- a/build/teamcity/util/roachtest_util.sh +++ b/build/teamcity/util/roachtest_util.sh @@ -56,7 +56,7 @@ trap upload_stats EXIT # Set up the parameters for the roachtest invocation. PARALLELISM=16 CPUQUOTA=1024 -TESTS="" +TESTS="${TESTS-}" case "${CLOUD}" in gce) ;;