diff --git a/pkg/ccl/backupccl/backup_job.go b/pkg/ccl/backupccl/backup_job.go index f3c80d59649e..e0b1ed0b1ea0 100644 --- a/pkg/ccl/backupccl/backup_job.go +++ b/pkg/ccl/backupccl/backup_job.go @@ -349,39 +349,14 @@ func backup( return roachpb.RowCount{}, err } - var tableStatistics []*stats.TableStatisticProto - for i := range backupManifest.Descriptors { - if tbl, _, _, _, _ := descpb.GetDescriptors(&backupManifest.Descriptors[i]); tbl != nil { - tableDesc := tabledesc.NewBuilder(tbl).BuildImmutableTable() - // Collect all the table stats for this table. - tableStatisticsAcc, err := statsCache.GetTableStats(ctx, tableDesc) - if err != nil { - // Successfully backed up data is more valuable than table stats that can - // be recomputed after restore, and so if we fail to collect the stats of a - // table we do not want to mark the job as failed. - // The lack of stats on restore could lead to suboptimal performance when - // reading/writing to this table until the stats have been recomputed. - log.Warningf(ctx, "failed to collect stats for table: %s, "+ - "table ID: %d during a backup: %s", tableDesc.GetName(), tableDesc.GetID(), - err.Error()) - continue - } - for _, stat := range tableStatisticsAcc { - tableStatistics = append(tableStatistics, &stat.TableStatisticProto) - } - } - } - statsTable := backuppb.StatsTable{ - Statistics: tableStatistics, - } - + statsTable := getTableStatsForBackup(ctx, statsCache, backupManifest.Descriptors) if err := backupinfo.WriteTableStatistics(ctx, defaultStore, encryption, &kmsEnv, &statsTable); err != nil { return roachpb.RowCount{}, err } if backupinfo.WriteMetadataSST.Get(&settings.SV) { if err := backupinfo.WriteBackupMetadataSST(ctx, defaultStore, encryption, &kmsEnv, backupManifest, - tableStatistics); err != nil { + statsTable.Statistics); err != nil { err = errors.Wrap(err, "writing forward-compat metadata sst") if !build.IsRelease() { return roachpb.RowCount{}, err @@ -410,6 +385,50 @@ func releaseProtectedTimestamp( return err } +// getTableStatsForBackup collects all stats for tables found in descs. +// +// We do not fail if we can't retrieve statistiscs. Successfully +// backed up data is more valuable than table stats that can be +// recomputed after restore. The lack of stats on restore could lead +// to suboptimal performance when reading/writing to this table until +// the stats have been recomputed. +func getTableStatsForBackup( + ctx context.Context, statsCache *stats.TableStatisticsCache, descs []descpb.Descriptor, +) backuppb.StatsTable { + var tableStatistics []*stats.TableStatisticProto + for i := range descs { + if tbl, _, _, _, _ := descpb.GetDescriptors(&descs[i]); tbl != nil { + tableDesc := tabledesc.NewBuilder(tbl).BuildImmutableTable() + tableStatisticsAcc, err := statsCache.GetTableStats(ctx, tableDesc) + if err != nil { + log.Warningf(ctx, "failed to collect stats for table: %s, "+ + "table ID: %d during a backup: %s", tableDesc.GetName(), tableDesc.GetID(), + err.Error()) + continue + } + + for _, stat := range tableStatisticsAcc { + if statShouldBeIncludedInBackupRestore(&stat.TableStatisticProto) { + tableStatistics = append(tableStatistics, &stat.TableStatisticProto) + } + } + } + } + return backuppb.StatsTable{ + Statistics: tableStatistics, + } +} + +func statShouldBeIncludedInBackupRestore(stat *stats.TableStatisticProto) bool { + // Forecasts and merged stats are computed from the persisted + // stats on demand and do not need to be backed up or + // restored. + if stat.Name == jobspb.ForecastStatsName || stat.Name == jobspb.MergedStatsName { + return false + } + return true +} + type backupResumer struct { job *jobs.Job backupStats roachpb.RowCount diff --git a/pkg/ccl/backupccl/backupinfo/backup_metadata.go b/pkg/ccl/backupccl/backupinfo/backup_metadata.go index 29fea0b17371..efc648336e9c 100644 --- a/pkg/ccl/backupccl/backupinfo/backup_metadata.go +++ b/pkg/ccl/backupccl/backupinfo/backup_metadata.go @@ -500,12 +500,12 @@ func writeStatsToMetadata( return stats[i].TableID < stats[j].TableID || (stats[i].TableID == stats[j].TableID && stats[i].StatisticID < stats[j].StatisticID) }) - for _, i := range stats { - b, err := protoutil.Marshal(i) + for _, s := range stats { + b, err := protoutil.Marshal(s) if err != nil { return err } - if err := sst.PutUnversioned(encodeStatSSTKey(i.TableID, i.StatisticID), b); err != nil { + if err := sst.PutUnversioned(encodeStatSSTKey(s.TableID, s.StatisticID), b); err != nil { return err } } diff --git a/pkg/ccl/backupccl/full_cluster_backup_restore_test.go b/pkg/ccl/backupccl/full_cluster_backup_restore_test.go index c6b0c68f3679..0119a4db0a58 100644 --- a/pkg/ccl/backupccl/full_cluster_backup_restore_test.go +++ b/pkg/ccl/backupccl/full_cluster_backup_restore_test.go @@ -609,12 +609,6 @@ func TestClusterRestoreFailCleanup(t *testing.T) { _, sqlDB, tempDir, cleanupFn := backupRestoreTestSetup(t, singleNode, numAccounts, InitManualReplication) defer cleanupFn() - // This test flakes due to - // https://github.com/cockroachdb/cockroach/issues/86806. Instead of skipping - // the test all together, setting the cluster setting to false which triggers - // the failure. - sqlDB.Exec(t, "SET CLUSTER SETTING kv.bulkio.write_metadata_sst.enabled=false") - // Setup the system systemTablesToVerify to ensure that they are copied to the new cluster. // Populate system.users. for i := 0; i < 1000; i++ { @@ -1078,7 +1072,7 @@ func TestRestoreWithRecreatedDefaultDB(t *testing.T) { sqlDB.Exec(t, ` DROP DATABASE defaultdb; -CREATE DATABASE defaultdb; +CREATE DATABASE defaultdb; `) sqlDB.Exec(t, `BACKUP TO $1`, localFoo) diff --git a/pkg/ccl/backupccl/restore_job.go b/pkg/ccl/backupccl/restore_job.go index 8b3e0d835eae..2bf05a1d62a4 100644 --- a/pkg/ccl/backupccl/restore_job.go +++ b/pkg/ccl/backupccl/restore_job.go @@ -524,13 +524,16 @@ func (r *restoreResumer) ForceRealSpan() bool { return true } -// remapRelevantStatistics changes the table ID references in the stats -// from those they had in the backed up database to what they should be -// in the restored database. -// It also selects only the statistics which belong to one of the tables -// being restored. If the descriptorRewrites can re-write the table ID, then that -// table is being restored. -func remapRelevantStatistics( +// remapAndFilterRelevantStatistics changes the table ID references in +// the stats from those they had in the backed up database to what +// they should be in the restored database. +// +// It also selects only the statistics which belong to one of the +// tables being restored. If the descriptorRewrites can re-write the +// table ID, then that table is being restored. +// +// Any statistics forecasts are ignored. +func remapAndFilterRelevantStatistics( ctx context.Context, tableStatistics []*stats.TableStatisticProto, descriptorRewrites jobspb.DescRewriteMap, @@ -540,11 +543,13 @@ func remapRelevantStatistics( tableHasStatsInBackup := make(map[descpb.ID]struct{}) for _, stat := range tableStatistics { - tableHasStatsInBackup[stat.TableID] = struct{}{} - if tableRewrite, ok := descriptorRewrites[stat.TableID]; ok { - // Statistics imported only when table re-write is present. - stat.TableID = tableRewrite.ID - relevantTableStatistics = append(relevantTableStatistics, stat) + if statShouldBeIncludedInBackupRestore(stat) { + tableHasStatsInBackup[stat.TableID] = struct{}{} + if tableRewrite, ok := descriptorRewrites[stat.TableID]; ok { + // Statistics imported only when table re-write is present. + stat.TableID = tableRewrite.ID + relevantTableStatistics = append(relevantTableStatistics, stat) + } } } @@ -1555,7 +1560,7 @@ func (r *restoreResumer) doResume(ctx context.Context, execCtx interface{}) erro backupStats, err := backupinfo.GetStatisticsFromBackup(ctx, defaultStore, details.Encryption, &kmsEnv, latestBackupManifest) if err == nil { - remappedStats = remapRelevantStatistics(ctx, backupStats, details.DescriptorRewrites, + remappedStats = remapAndFilterRelevantStatistics(ctx, backupStats, details.DescriptorRewrites, details.TableDescs) } else { // We don't want to fail the restore if we are unable to resolve statistics diff --git a/pkg/ccl/backupccl/testdata/backup-restore/in-progress-import-rollback b/pkg/ccl/backupccl/testdata/backup-restore/in-progress-import-rollback index 5cd89c3e29ab..97cb3a24e656 100644 --- a/pkg/ccl/backupccl/testdata/backup-restore/in-progress-import-rollback +++ b/pkg/ccl/backupccl/testdata/backup-restore/in-progress-import-rollback @@ -32,11 +32,6 @@ exec-sql SET CLUSTER SETTING jobs.debug.pausepoints = 'import.after_ingest'; ---- -exec-sql -SET CLUSTER SETTING kv.bulkio.write_metadata_sst.enabled = false; ----- - - exec-sql SET CLUSTER SETTING storage.mvcc.range_tombstones.enabled = false; ---- @@ -247,10 +242,6 @@ exec-sql SET CLUSTER SETTING storage.mvcc.range_tombstones.enabled = true; ---- -exec-sql -SET CLUSTER SETTING kv.bulkio.write_metadata_sst.enabled = false; ----- - # Pause the import job, in order to back up the importing data. import expect-pausepoint tag=b IMPORT INTO foo2 (i,s) CSV DATA ('nodelocal://0/export1/export*-n*.0.csv') diff --git a/pkg/ccl/backupccl/testdata/backup-restore/metadata b/pkg/ccl/backupccl/testdata/backup-restore/metadata new file mode 100644 index 000000000000..7f62813a3ba6 --- /dev/null +++ b/pkg/ccl/backupccl/testdata/backup-restore/metadata @@ -0,0 +1,42 @@ +new-cluster name=s +---- + +# Make sure that the presence of forecasted stats doesn't break +# metadata SST writing. This is a regression test for #86806. +# +# We create 3 statistics manually which is the minimum needed to +# create a forecast. Automatic collection is disabled to avoid +# flakiness. +exec-sql +SET CLUSTER SETTING kv.bulkio.write_metadata_sst.enabled = true; +---- + +exec-sql +SET CLUSTER SETTING sql.stats.automatic_collection.enabled = false; +---- + +exec-sql +CREATE DATABASE db1; +USE db1; +CREATE TABLE tab (a INT PRIMARY KEY, b INT); +---- + +exec-sql +CREATE STATISTICS __auto__ ON a FROM tab; +CREATE STATISTICS __auto__ ON b FROM tab; +CREATE STATISTICS __auto__ ON a FROM tab; +CREATE STATISTICS __auto__ ON b FROM tab; +CREATE STATISTICS __auto__ ON a FROM tab; +CREATE STATISTICS __auto__ ON b FROM tab; +---- + +# The previous bug occurs when there are two or more forecasts since +# forecasts do not hava a statistics ID. +query-sql +SELECT count(1) FROM [ SHOW STATISTICS FOR TABLE tab WITH FORECAST ] WHERE statistics_name = '__forecast__' +---- +2 + +exec-sql +BACKUP DATABASE db1 INTO 'nodelocal://0/test/' +----