From c7cd841d378242771ba44bd225ab663c42525db5 Mon Sep 17 00:00:00 2001 From: Andrew Werner Date: Thu, 23 Jun 2022 00:55:59 -0400 Subject: [PATCH] sql: update progress in column backfiller non-transactionally Backports will address #83202. Release note (bug fix): Due to a bug in transaction lifetime management, a lock could be held for a long period of time when adding a new column to a table (or altering a column type). This contention could make the jobs page non-responsive and job adoption slow. This bug has now been fixed. --- pkg/sql/backfill.go | 12 ++++++- pkg/sql/schema_changer_test.go | 62 ++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/pkg/sql/backfill.go b/pkg/sql/backfill.go index f5af90ae8223..4eac65b9e7ea 100644 --- a/pkg/sql/backfill.go +++ b/pkg/sql/backfill.go @@ -1255,7 +1255,17 @@ func (sc *SchemaChanger) distColumnBackfill( if nRanges < origNRanges { fractionRangesFinished := float32(origNRanges-nRanges) / float32(origNRanges) fractionCompleted := origFractionCompleted + fractionLeft*fractionRangesFinished - if err := sc.job.FractionProgressed(ctx, txn, jobs.FractionUpdater(fractionCompleted)); err != nil { + // Note that this explicitly uses a nil txn, which will lead to a new + // transaction being created as a part of this update. In general, the + // transaction in scope here is dubious: it exists, presumably, for the + // purpose of holding descriptor leases, but it isn't used in any of + // the writing logic of performing the backfill chunk. If we were to + // use it here, we'd place a lock on the jobs table which would not be + // released until the backfill chunk finished. That migh be quite a + // long time. + if err := sc.job.FractionProgressed( + ctx, nil /* txn */, jobs.FractionUpdater(fractionCompleted), + ); err != nil { return jobs.SimplifyInvalidStatusError(err) } } diff --git a/pkg/sql/schema_changer_test.go b/pkg/sql/schema_changer_test.go index a31f9082080c..fdf3767247f7 100644 --- a/pkg/sql/schema_changer_test.go +++ b/pkg/sql/schema_changer_test.go @@ -8307,3 +8307,65 @@ func TestVirtualColumnNotAllowedInPkeyBefore22_1(t *testing.T) { require.Error(t, err) require.Equal(t, "pq: cannot use virtual column \"b\" in primary key", err.Error()) } + +// TestColumnBackfillProcessingDoesNotHoldLockOnJobsTable is a +// regression test to ensure that when the column backfill progresses +// to the next backfill chunk and it needs to update its progress, it +// does not hold a lock on the jobs table for the duration of processing +// the next chunk. +func TestColumnBackfillProcessingDoesNotHoldLockOnJobsTable(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + + params, _ := tests.CreateTestServerParams() + chCh := make(chan chan error) + params.Knobs.DistSQL = &execinfra.TestingKnobs{ + RunBeforeBackfillChunk: func(sp roachpb.Span) error { + ch := make(chan error) + chCh <- ch + return <-ch + }, + } + params.Knobs.SQLSchemaChanger = &sql.SchemaChangerTestingKnobs{ + BackfillChunkSize: 1, + WriteCheckpointInterval: time.Nanosecond, + } + + s, sqlDB, _ := serverutils.StartServer(t, params) + defer s.Stopper().Stop(ctx) + tdb := sqlutils.MakeSQLRunner(sqlDB) + tdb.Exec(t, "CREATE TABLE foo (i INT PRIMARY KEY)") + tdb.Exec(t, "INSERT INTO foo SELECT * FROM generate_series(1, 10)") + tdb.Exec(t, "ALTER TABLE foo SPLIT AT SELECT * FROM generate_series(1, 9)") + errCh := make(chan error) + go func() { + _, err := sqlDB.Exec(` +SET use_declarative_schema_changer = 'off'; +ALTER TABLE foo ADD COLUMN j INT DEFAULT 42; +`) + errCh <- err + }() + // Wait for one iteration. + close(<-chCh) + // Wait for another iteration. + ch := <-chCh + // Ensure that the progress has been set to something non-zero, and + // that a lock has not been held. + tdb.CheckQueryResults(t, ` +SELECT fraction_completed > 0 + FROM crdb_internal.jobs + WHERE description LIKE '%ADD COLUMN j INT8 DEFAULT 42'`, + [][]string{{"true"}}) + close(ch) + for { + select { + case ch := <-chCh: + close(ch) + case err := <-errCh: + require.NoError(t, err) + return + } + } +}