Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
136705: ui: add metric assertions to overview page e2e test r=kyle-a-wong a=kyle-a-wong

Adds assertions to the Overview page to make sure that data from nodes_ui is properly fetched and rendered on the page.

Epic: None
Release note: None

136859: sql: reduce allocations in `NewPlanningCtxWithOracle` r=mgartner a=mgartner

Epic: None

Release note: None

136899: sql/schemachanger: log schema change job latency in structured log r=rafiss a=rafiss

part of cockroachdb#134326
Release note (ops change): When a schema change job is completed, rolls back, or encounteres a failure, the time taken since the job began is now logged in a structured log in the SQL_SCHEMA log channel.

Co-authored-by: Kyle Wong <[email protected]>
Co-authored-by: Marcus Gartner <[email protected]>
Co-authored-by: Rafi Shamim <[email protected]>
  • Loading branch information
4 people committed Dec 6, 2024
4 parents db133be + 66aea1c + c47ef40 + 2b421ea commit 45aae10
Show file tree
Hide file tree
Showing 15 changed files with 135 additions and 34 deletions.
7 changes: 7 additions & 0 deletions docs/generated/eventlog.md
Original file line number Diff line number Diff line change
Expand Up @@ -1686,6 +1686,9 @@ An event of type `finish_schema_change` is recorded when a previously initiated
change has completed.


| Field | Description | Sensitive |
|--|--|--|
| `LatencyNanos` | The amount of time the schema change job took to complete. | no |


#### Common fields
Expand All @@ -1704,6 +1707,9 @@ An event of type `finish_schema_change_rollback` is recorded when a previously
initiated schema change rollback has completed.


| Field | Description | Sensitive |
|--|--|--|
| `LatencyNanos` | The amount of time the schema change job took to rollback. | no |


#### Common fields
Expand Down Expand Up @@ -1868,6 +1874,7 @@ encounters a problem and is reversed.
|--|--|--|
| `Error` | The error encountered that caused the schema change to be reversed. The specific format of the error is variable and can change across releases without warning. | yes |
| `SQLSTATE` | The SQLSTATE code for the error. | no |
| `LatencyNanos` | The amount of time the schema change job took before being reverted. | no |


#### Common fields
Expand Down
17 changes: 12 additions & 5 deletions pkg/sql/distsql_physical_planner.go
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,9 @@ type PlanningCtx struct {
// local flow finished running and is being cleaned up. It allows us to
// release the resources that are acquired during the physical planning and
// are being held onto throughout the whole flow lifecycle.
// Note that these functions might not be called in some cases (when
// DistSQLPlanner.Run is not called), but that is ok since on the main query
// path it will get called.
onFlowCleanup []func()

// This is true if plan is a simple insert that can be vectorized.
Expand Down Expand Up @@ -1072,7 +1075,13 @@ func flowSpecsToDiagram(
// local flow finished running. This can be called only after the physical
// planning has been completed.
func (p *PlanningCtx) getCleanupFunc() func() {
if len(p.onFlowCleanup) == 0 {
// Make sure to release the physical infrastructure after the execution
// finishes.
return p.infra.Release
}
return func() {
p.infra.Release()
for _, r := range p.onFlowCleanup {
r()
}
Expand Down Expand Up @@ -5049,18 +5058,16 @@ func (dsp *DistSQLPlanner) NewPlanningCtxWithOracle(
localityFiler roachpb.Locality,
) *PlanningCtx {
distribute := distributionType == FullDistribution
// Note: infra.Release is not added to the planCtx's onFlowCleanup
// functions below. It is instead called directly in the function
// returned by getCleanupFunc.
infra := physicalplan.NewPhysicalInfrastructure(uuid.MakeV4(), dsp.gatewaySQLInstanceID)
planCtx := &PlanningCtx{
ExtendedEvalCtx: evalCtx,
localityFilter: localityFiler,
infra: infra,
isLocal: !distribute,
planner: planner,
// Make sure to release the physical infrastructure after the execution
// finishes. Note that onFlowCleanup might not be called in some cases
// (when DistSQLPlanner.Run is not called), but that is ok since on the
// main query path it will get called.
onFlowCleanup: []func(){infra.Release},
}
if !distribute {
if planner == nil ||
Expand Down
19 changes: 12 additions & 7 deletions pkg/sql/logictest/testdata/logic_test/event_log
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,16 @@ WHERE "eventType" = 'alter_table'
1 {"EventType": "alter_table", "MutationID": 1, "Statement": "ALTER TABLE test.public.a ADD COLUMN val INT8", "TableName": "test.public.a", "Tag": "ALTER TABLE", "User": "root"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'
----
1 {"EventType": "finish_schema_change", "InstanceID": 1}

# Verify that LatencyNanos is populated.
statement ok
SELECT 1 / coalesce((info::JSONB->'LatencyNanos')::INT, 0) FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'

query I
SELECT "reportingID" FROM system.eventlog
WHERE "eventType" = 'reverse_schema_change'
Expand Down Expand Up @@ -156,20 +161,20 @@ ORDER BY "timestamp", info
1 {"EventType": "create_index", "IndexName": "foo", "MutationID": 1, "Statement": "ALTER TABLE test.public.a ADD CONSTRAINT foo UNIQUE (val)", "TableName": "test.public.a", "Tag": "ALTER TABLE", "User": "root"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'
----
1 {"EventType": "finish_schema_change", "InstanceID": 1}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'Error'
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'Error' - 'LatencyNanos'
FROM system.eventlog
WHERE "eventType" = 'reverse_schema_change'
----
1 {"EventType": "reverse_schema_change", "InstanceID": 1, "SQLSTATE": "23505"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change_rollback'
----
1 {"EventType": "finish_schema_change_rollback", "InstanceID": 1}
Expand All @@ -188,7 +193,7 @@ WHERE "eventType" = 'create_index'
1 {"EventType": "create_index", "IndexName": "a_foo", "MutationID": 1, "Statement": "CREATE INDEX a_foo ON test.public.a (val)", "TableName": "test.public.a", "Tag": "CREATE INDEX", "User": "root"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'
ORDER BY "timestamp", info
----
Expand All @@ -206,7 +211,7 @@ WHERE "eventType" = 'create_index'
1 {"EventType": "create_index", "IndexName": "a_val_idx", "MutationID": 1, "Statement": "CREATE INDEX ON test.public.a (val)", "TableName": "test.public.a", "Tag": "CREATE INDEX", "User": "root"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'
ORDER BY "timestamp", info
----
Expand All @@ -228,7 +233,7 @@ WHERE "eventType" = 'drop_index'
1 {"EventType": "drop_index", "IndexName": "a_foo", "MutationID": 1, "Statement": "DROP INDEX test.public.a@a_foo", "TableName": "test.public.a", "Tag": "DROP INDEX", "User": "root"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'
ORDER BY "timestamp", info
----
Expand Down
19 changes: 12 additions & 7 deletions pkg/sql/logictest/testdata/logic_test/event_log_legacy
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,16 @@ WHERE "eventType" = 'alter_table'
1 {"EventType": "alter_table", "MutationID": 1, "Statement": "ALTER TABLE test.public.a ADD COLUMN val INT8", "TableName": "test.public.a", "Tag": "ALTER TABLE", "User": "root"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'
----
1 {"EventType": "finish_schema_change", "InstanceID": 1, "MutationID": 1}

# Verify that LatencyNanos is populated.
statement ok
SELECT 1 / coalesce((info::JSONB->'LatencyNanos')::INT, 0) FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'

query I
SELECT "reportingID" FROM system.eventlog
WHERE "eventType" = 'reverse_schema_change'
Expand Down Expand Up @@ -156,21 +161,21 @@ ORDER BY "timestamp", info
1 {"EventType": "alter_table", "MutationID": 2, "Statement": "ALTER TABLE test.public.a ADD CONSTRAINT foo UNIQUE (val)", "TableName": "test.public.a", "Tag": "ALTER TABLE", "User": "root"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'
----
1 {"EventType": "finish_schema_change", "InstanceID": 1, "MutationID": 1}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'Error'
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'Error' - 'LatencyNanos'
FROM system.eventlog
WHERE "eventType" = 'reverse_schema_change'
----
1 {"EventType": "reverse_schema_change", "InstanceID": 1, "MutationID": 2, "SQLSTATE": "23505"}


query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change_rollback'
----
1 {"EventType": "finish_schema_change_rollback", "InstanceID": 1, "MutationID": 2}
Expand All @@ -189,7 +194,7 @@ WHERE "eventType" = 'create_index'
1 {"EventType": "create_index", "IndexName": "a_foo", "MutationID": 3, "Statement": "CREATE INDEX a_foo ON test.public.a (val)", "TableName": "test.public.a", "Tag": "CREATE INDEX", "User": "root"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'
ORDER BY "timestamp", info
----
Expand All @@ -207,7 +212,7 @@ WHERE "eventType" = 'create_index'
1 {"EventType": "create_index", "IndexName": "a_val_idx", "MutationID": 4, "Statement": "CREATE INDEX ON test.public.a (val)", "TableName": "test.public.a", "Tag": "CREATE INDEX", "User": "root"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'
ORDER BY "timestamp", info
----
Expand All @@ -229,7 +234,7 @@ WHERE "eventType" = 'drop_index'
1 {"EventType": "drop_index", "IndexName": "a_foo", "MutationID": 5, "Statement": "DROP INDEX test.public.a@a_foo", "TableName": "test.public.a", "Tag": "DROP INDEX", "User": "root"}

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' FROM system.eventlog
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos' FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'
ORDER BY "timestamp", info
----
Expand Down
13 changes: 9 additions & 4 deletions pkg/sql/logictest/testdata/logic_test/new_schema_changer
Original file line number Diff line number Diff line change
Expand Up @@ -1100,13 +1100,18 @@ statement ok
DROP VIEW v1ev CASCADE;

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID'
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos'
FROM system.eventlog
ORDER BY "timestamp", info DESC
----
1 {"CascadeDroppedViews": ["test.public.v4ev"], "EventType": "drop_view", "Statement": "DROP VIEW test.public.v1ev CASCADE", "Tag": "DROP VIEW", "User": "root", "ViewName": "test.public.v1ev"}
1 {"EventType": "finish_schema_change", "InstanceID": 1}

# Verify that LatencyNanos is populated.
statement ok
SELECT 1 / coalesce((info::JSONB->'LatencyNanos')::INT, 0) FROM system.eventlog
WHERE "eventType" = 'finish_schema_change'

statement ok
CREATE VIEW v1ev AS (SELECT name FROM T1EV);

Expand All @@ -1120,7 +1125,7 @@ statement ok
DROP TABLE t1ev,t2ev CASCADE;

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID'
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos'
FROM system.eventlog
ORDER BY timestamp, info DESC;
----
Expand All @@ -1139,7 +1144,7 @@ statement ok
ALTER TABLE fooev ADD COLUMN j INT

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID'
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos'
FROM system.eventlog
ORDER BY timestamp, info DESC;
----
Expand Down Expand Up @@ -1187,7 +1192,7 @@ statement ok
DROP DATABASE db2 cascade;

query IT
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID'
SELECT "reportingID", info::JSONB - 'Timestamp' - 'DescriptorID' - 'LatencyNanos'
FROM system.eventlog
ORDER BY timestamp, info DESC;
----
Expand Down
15 changes: 11 additions & 4 deletions pkg/sql/schema_changer.go
Original file line number Diff line number Diff line change
Expand Up @@ -1932,11 +1932,16 @@ func (sc *SchemaChanger) done(ctx context.Context) error {
return err
}

startTime := timeutil.FromUnixMicros(sc.job.Payload().StartedMicros)
var info logpb.EventPayload
if isRollback {
info = &eventpb.FinishSchemaChangeRollback{}
info = &eventpb.FinishSchemaChangeRollback{
LatencyNanos: timeutil.Since(startTime).Nanoseconds(),
}
} else {
info = &eventpb.FinishSchemaChange{}
info = &eventpb.FinishSchemaChange{
LatencyNanos: timeutil.Since(startTime).Nanoseconds(),
}
}

// Log "Finish Schema Change" or "Finish Schema Change Rollback"
Expand Down Expand Up @@ -2217,14 +2222,16 @@ func (sc *SchemaChanger) maybeReverseMutations(ctx context.Context, causingError
// Log "Reverse Schema Change" event. Only the causing error and the
// mutation ID are logged; this can be correlated with the DDL statement
// that initiated the change using the mutation id.
startTime := timeutil.FromUnixMicros(sc.job.Payload().StartedMicros)
return logEventInternalForSchemaChanges(
ctx, sc.execCfg, txn,
sc.sqlInstanceID,
sc.descID,
sc.mutationID,
&eventpb.ReverseSchemaChange{
Error: fmt.Sprintf("%+v", causingError),
SQLSTATE: pgerror.GetPGCode(causingError).String(),
Error: fmt.Sprintf("%+v", causingError),
SQLSTATE: pgerror.GetPGCode(causingError).String(),
LatencyNanos: timeutil.Since(startTime).Nanoseconds(),
})
})
if err != nil || alreadyReversed {
Expand Down
3 changes: 3 additions & 0 deletions pkg/sql/schemachanger/scdeps/sctestdeps/test_deps.go
Original file line number Diff line number Diff line change
Expand Up @@ -1299,6 +1299,9 @@ func (s *TestState) logEvent(event logpb.EventPayload) error {
// Remove common details from text output, they're never decorated.
if inM, ok := in.(map[string]interface{}); ok {
delete(inM, "common")

// Also remove latency measurement, since it's not deterministic.
delete(inM, "latencyNanos")
}
})
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions pkg/sql/schemachanger/scjob/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,13 @@ func (n *newSchemaChangeResumer) run(ctx context.Context, execCtxI interface{})
return nil
}

startTime := timeutil.FromUnixMicros(payload.StartedMicros)
err := scrun.RunSchemaChangesInJob(
ctx,
execCfg.DeclarativeSchemaChangerTestingKnobs,
n.deps,
n.job.ID(),
startTime,
payload.DescriptorIDs,
n.rollbackCause,
)
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/schemachanger/scrun/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ go_library(
"//pkg/util/log/logpb",
"//pkg/util/mon",
"//pkg/util/protoutil",
"//pkg/util/timeutil",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_redact//:redact",
],
Expand Down
19 changes: 14 additions & 5 deletions pkg/sql/schemachanger/scrun/scrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package scrun
import (
"context"
"fmt"
"time"

"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/jobs"
Expand All @@ -27,6 +28,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/log/eventpb"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)
Expand Down Expand Up @@ -112,6 +114,7 @@ func RunSchemaChangesInJob(
knobs *scexec.TestingKnobs,
deps JobRunDependencies,
jobID jobspb.JobID,
jobStartTime time.Time,
descriptorIDs []descpb.ID,
rollbackCause error,
) error {
Expand All @@ -120,7 +123,7 @@ func RunSchemaChangesInJob(
return err
}
}
p, err := makePostCommitPlan(ctx, deps, jobID, descriptorIDs, rollbackCause)
p, err := makePostCommitPlan(ctx, deps, jobID, jobStartTime, descriptorIDs, rollbackCause)
if err != nil {
if knobs != nil && knobs.OnPostCommitPlanError != nil {
return knobs.OnPostCommitPlanError(err)
Expand All @@ -141,9 +144,13 @@ func RunSchemaChangesInJob(
if i+1 == len(p.Stages) {
var template eventpb.EventWithCommonSchemaChangePayload
if p.CurrentState.InRollback {
template = &eventpb.FinishSchemaChangeRollback{}
template = &eventpb.FinishSchemaChangeRollback{
LatencyNanos: timeutil.Since(jobStartTime).Nanoseconds(),
}
} else {
template = &eventpb.FinishSchemaChange{}
template = &eventpb.FinishSchemaChange{
LatencyNanos: timeutil.Since(jobStartTime).Nanoseconds(),
}
}
if err := logSchemaChangeEvents(ctx, el, p.CurrentState, template); err != nil {
return err
Expand Down Expand Up @@ -218,6 +225,7 @@ func makePostCommitPlan(
ctx context.Context,
deps JobRunDependencies,
jobID jobspb.JobID,
jobStartTime time.Time,
descriptorIDs []descpb.ID,
rollbackCause error,
) (scplan.Plan, error) {
Expand Down Expand Up @@ -252,8 +260,9 @@ func makePostCommitPlan(
// Revert the schema change and write about it in the event log.
state.Rollback()
return logSchemaChangeEvents(ctx, eventLogger, state, &eventpb.ReverseSchemaChange{
Error: fmt.Sprintf("%v", rollbackCause),
SQLSTATE: pgerror.GetPGCode(rollbackCause).String(),
Error: fmt.Sprintf("%v", rollbackCause),
SQLSTATE: pgerror.GetPGCode(rollbackCause).String(),
LatencyNanos: timeutil.Since(jobStartTime).Nanoseconds(),
})
}
return nil
Expand Down
Loading

0 comments on commit 45aae10

Please sign in to comment.