Skip to content

Commit

Permalink
backupccl: fingerprint 15GB restore roachtests
Browse files Browse the repository at this point in the history
Previously, restore roachtests had little ability to detect data corruption
regressions across runs. This patch introduces this ability. Specifically,
this commit allows the restore roachtest writer to easily run a stripped
fingerprint after a restore, and assert a match to the hardcoded fingerprint
in the test spec.

For now, the fingerprint check is only run on the restore roachtests that
restore 15GB of data. The check takes about the same amount of time it takes to
run the restore (around 3 minutes), so before we use it on larger tests, we
ought to consider adding performance improvements to the fingerprinting tool.
The 15GB restore roachtests that now run the fingerprint check are:
- restore/pause/tpce/15GB/aws/nodes=4/cpus=8 (used to restore 80GB)
- restore/tpce/15GB/aws/nodes=4/cpus=8 (new test)
- restore/nodeShutdown/worker (used to restore 80GB)
- restore/nodeShutdown/coordinator (used to restore 80GB)

This patch also changes the node shutdown tests and the paused restore test to
run the smaller 15GB tpce fixture, as it speeds the test run up.

Informs cockroachdb#98779

Release note: none
  • Loading branch information
msbutler committed Mar 28, 2023
1 parent 20e41ff commit c284398
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 8 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ go_library(
"//pkg/internal/sqlsmith",
"//pkg/jobs",
"//pkg/jobs/jobspb",
"//pkg/keys",
"//pkg/kv",
"//pkg/kv/kvpb",
"//pkg/multitenant/mtinfopb",
Expand Down
64 changes: 56 additions & 8 deletions pkg/cmd/roachtest/tests/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ import (
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
Expand All @@ -46,9 +48,10 @@ func registerRestoreNodeShutdown(r registry.Registry) {
sp := restoreSpecs{
hardware: makeHardwareSpecs(hardwareSpecs{}),
backup: makeBackupSpecs(
backupSpecs{workload: tpceRestore{customers: 5000},
backupSpecs{workload: tpceRestore{customers: 1000},
version: "v22.2.1"}),
timeout: 1 * time.Hour,
timeout: 1 * time.Hour,
fingerprint: 8445446819555404274,
}

makeRestoreStarter := func(ctx context.Context, t test.Test, c cluster.Cluster, gatewayNode int) jobStarter {
Expand All @@ -75,6 +78,7 @@ func registerRestoreNodeShutdown(r registry.Registry) {
c.Put(ctx, t.Cockroach(), "./cockroach")
c.Start(ctx, t.L(), option.DefaultStartOptsNoBackups(), install.MakeClusterSettings())
jobSurvivesNodeShutdown(ctx, t, c, nodeToShutdown, makeRestoreStarter(ctx, t, c, gatewayNode))
sp.checkFingerprint(ctx)
},
})

Expand All @@ -97,6 +101,7 @@ func registerRestoreNodeShutdown(r registry.Registry) {
c.Start(ctx, t.L(), option.DefaultStartOptsNoBackups(), install.MakeClusterSettings())

jobSurvivesNodeShutdown(ctx, t, c, nodeToShutdown, makeRestoreStarter(ctx, t, c, gatewayNode))
sp.checkFingerprint(ctx)
},
})
}
Expand All @@ -112,10 +117,11 @@ func registerRestore(r registry.Registry) {
withPauseSpecs := restoreSpecs{
hardware: makeHardwareSpecs(hardwareSpecs{}),
backup: makeBackupSpecs(
backupSpecs{workload: tpceRestore{customers: 5000},
backupSpecs{workload: tpceRestore{customers: 1000},
version: "v22.2.1"}),
timeout: 3 * time.Hour,
namePrefix: "pause",
timeout: 3 * time.Hour,
namePrefix: "pause",
fingerprint: 8445446819555404274,
}
withPauseSpecs.initTestName()

Expand Down Expand Up @@ -254,6 +260,7 @@ func registerRestore(r registry.Registry) {
}
}
metricCollector()
withPauseSpecs.checkFingerprint(ctx)
return nil
})
m.Wait()
Expand Down Expand Up @@ -327,6 +334,15 @@ func registerRestore(r registry.Registry) {
timeout: 24 * time.Hour,
tags: []string{"weekly", "aws-weekly"},
},
{
// A teeny weeny 15GB restore that could be used to bisect scale agnostic perf regressions.
hardware: makeHardwareSpecs(hardwareSpecs{}),
backup: makeBackupSpecs(
backupSpecs{workload: tpceRestore{customers: 1000},
version: "v22.2.1"}),
timeout: 3 * time.Hour,
fingerprint: 8445446819555404274,
},
// TODO(msbutler): add the following tests once roachperf/grafana is hooked up and old tests are
// removed:
// - restore/tpce/400GB/nodes=30
Expand Down Expand Up @@ -372,6 +388,7 @@ func registerRestore(r registry.Registry) {
return err
}
metricCollector()
sp.checkFingerprint(ctx)
return nil
})
m.Wait()
Expand Down Expand Up @@ -568,6 +585,8 @@ func (tpce tpceRestore) String() string {
var builder strings.Builder
builder.WriteString("tpce/")
switch tpce.customers {
case 1000:
builder.WriteString("15GB")
case 5000:
builder.WriteString("80GB")
case 25000:
Expand All @@ -591,9 +610,10 @@ type restoreSpecs struct {
// namePrefix appears in the name of the roachtest, i.e. `restore/{prefix}/{config}`.
namePrefix string

t test.Test
c cluster.Cluster
testName string
t test.Test
c cluster.Cluster
testName string
fingerprint int
}

func (sp *restoreSpecs) initTestName() {
Expand Down Expand Up @@ -680,6 +700,34 @@ func (sp *restoreSpecs) initRestorePerfMetrics(
}
}

// checkFingerprint runs a stripped fingerprint on all user tables in the
// cluster and asserts that it matches the fingerprint hardcoded in the restore
// spec. If the spec has a zero fingerprint, the check is skipped and a message
// is logged instead.
//
// The fingerprinted key span runs from the table prefix of the smallest
// descriptor ID to the end of the table prefix of the largest descriptor ID
// among system.namespace entries whose parentID is greater than 1. Any query
// failure or fingerprint mismatch fails the test via sp.t.
func (sp *restoreSpecs) checkFingerprint(ctx context.Context) {
	if sp.fingerprint == 0 {
		sp.t.L().Printf("Fingerprint not found in specs. Skipping fingerprint check.")
		return
	}

	conn, err := sp.c.ConnE(ctx, sp.t.L(), sp.c.Node(1)[0])
	require.NoError(sp.t, err)
	// Close the connection once the check is done so that repeated calls do
	// not leak connections for the remainder of the test. The close error is
	// deliberately ignored: the fingerprint comparison is the test's verdict.
	defer conn.Close()
	sqlDB := sqlutils.MakeSQLRunner(conn)

	// Find the bounds of the descriptor IDs to fingerprint.
	var minUserTableID, maxUserTableID uint32
	sqlDB.QueryRow(sp.t, `SELECT min(id) FROM system.namespace WHERE "parentID" >1`).Scan(&minUserTableID)
	sqlDB.QueryRow(sp.t, `SELECT max(id) FROM system.namespace WHERE "parentID" >1`).Scan(&maxUserTableID)

	// Translate the ID bounds into a key span in the system tenant's keyspace.
	// PrefixEnd makes the span include every key of the last table.
	codec := keys.MakeSQLCodec(roachpb.SystemTenantID)
	startKey := codec.TablePrefix(minUserTableID)
	endKey := codec.TablePrefix(maxUserTableID).PrefixEnd()

	// Time the fingerprint so its cost relative to the restore is visible in
	// the test logs.
	startTime := timeutil.Now()
	var fingerprint int
	sqlDB.QueryRow(sp.t, `SELECT * FROM crdb_internal.fingerprint(ARRAY[$1::BYTES, $2::BYTES],true)`,
		startKey, endKey).Scan(&fingerprint)
	sp.t.L().Printf("Fingerprint is %d. Took %.2f minutes", fingerprint, timeutil.Since(startTime).Minutes())
	require.Equal(sp.t, sp.fingerprint, fingerprint, "user table fingerprint mismatch")
}

// exportToRoachperf exports a single perf metric for the given test to roachperf.
func exportToRoachperf(
ctx context.Context, t test.Test, c cluster.Cluster, testName string, metric int64,
Expand Down

0 comments on commit c284398

Please sign in to comment.