Merge #105639
105639: roachtest: fix admission-control/{index-backfill,database-drop} r=irfansharif a=irfansharif

These two roachtests previously attempted to (opportunistically) share disk snapshots. In #105260 we observed that when running simultaneously, the two clusters end up using the same cluster ID and there's cross-talk from persisted gossip state where we record IP addresses. This commit prevents this snapshot re-use, giving each test its own snapshot. While here, we reduce the cadence of these tests to weekly runs, since they've (otherwise) been non-flaky.

Fixes #105260.
Fixes #105261.

Release note: None

Co-authored-by: irfan sharif <[email protected]>
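
For illustration only (not part of the original commit): a minimal, self-contained Go sketch of the per-test snapshot-prefix pattern the fix moves to, under the assumption that each test lists volume snapshots under its own prefix and otherwise sets up TPC-E from scratch. The snapshotStore type and helper names below are stand-ins, not the actual roachtest API (which uses c.ListSnapshots, c.ApplySnapshots, and the TestSpec SnapshotPrefix field shown in the diff below).

package main

import (
	"fmt"
	"strings"
)

// snapshotStore is a stand-in for the cluster's volume-snapshot listing API.
type snapshotStore struct {
	snapshots []string // snapshot names, e.g. "index-backfill-tpce-100k-0001"
}

// list returns snapshots whose names start with the given per-test prefix.
func (s *snapshotStore) list(prefix string) []string {
	var out []string
	for _, name := range s.snapshots {
		if strings.HasPrefix(name, prefix) {
			out = append(out, name)
		}
	}
	return out
}

// setUpFromSnapshotsOrScratch mirrors the list-or-create flow: reuse snapshots
// under this test's own prefix if present, otherwise do the expensive setup.
func setUpFromSnapshotsOrScratch(s *snapshotStore, prefix string) {
	if snaps := s.list(prefix); len(snaps) > 0 {
		fmt.Printf("reusing %d snapshot(s) with prefix %q\n", len(snaps), prefix)
		return
	}
	fmt.Printf("no snapshots with prefix %q; setting up TPC-E from scratch\n", prefix)
}

func main() {
	store := &snapshotStore{snapshots: []string{"index-backfill-tpce-100k-0001"}}
	// Each test now owns a distinct prefix, so database-drop no longer picks up
	// the volumes created by index-backfill (and vice versa).
	setUpFromSnapshotsOrScratch(store, "index-backfill-tpce-100k")
	setUpFromSnapshotsOrScratch(store, "droppable-database-tpce-100k")
}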
craig[bot] and irfansharif committed Jun 29, 2023
2 parents 72b199f + 453f7f4 commit 4104f66
Showing 2 changed files with 35 additions and 78 deletions.
98 changes: 29 additions & 69 deletions pkg/cmd/roachtest/tests/admission_control_database_drop.go
@@ -40,12 +40,11 @@ func registerDatabaseDrop(r registry.Registry) {
clusterSpec.GCEVolumeType = "pd-ssd"

r.Add(registry.TestSpec{
Name: "admission-control/database-drop",
Timeout: 10 * time.Hour,
Owner: registry.OwnerAdmissionControl,
Benchmark: true,
// TODO(irfansharif): Reduce to weekly cadence once stabilized.
// Tags: registry.Tags(`weekly`),
Name: "admission-control/database-drop",
Timeout: 10 * time.Hour,
Owner: registry.OwnerAdmissionControl,
Benchmark: true,
Tags: registry.Tags(`weekly`),
Cluster: clusterSpec,
RequiresLicense: true,
SnapshotPrefix: "droppable-database-tpce-100k",
@@ -81,70 +80,31 @@ func registerDatabaseDrop(r registry.Registry) {
// explicit throughout. It works, but too tacitly.
c.Run(ctx, c.All(), fmt.Sprintf("cp %s ./cockroach", path))

// Opportunistically try to make use of the tpce-100k snapshot,
// which another test creates.
tpce100kSnapshots, err := c.ListSnapshots(ctx, vm.VolumeSnapshotListOpts{
NamePrefix: tpce100kSnapshotPrefix,
})
if err != nil {
t.Fatal(err)
}
if len(tpce100kSnapshots) == 0 {
// Welp -- we don't have that either. Start from the start
// of time, by creating the first TPC-E 100k data set.
t.L().Printf("inner: no existing snapshots found for %s, doing pre-work",
t.SnapshotPrefix())

// Set up TPC-E with 100k customers. Do so using a published
// CRDB release, since we'll use this state to generate disk
// snapshots.
runTPCE(ctx, t, c, tpceOptions{
start: func(ctx context.Context, t test.Test, c cluster.Cluster) {
settings := install.MakeClusterSettings(install.NumRacksOption(crdbNodes))
if err := c.StartE(ctx, t.L(), option.DefaultStartOptsNoBackups(), settings, c.Range(1, crdbNodes)); err != nil {
t.Fatal(err)
}
},
customers: 100_000,
disablePrometheus: true,
setupType: usingTPCEInit,
estimatedSetupTime: 4 * time.Hour,
nodes: crdbNodes,
cpus: clusterSpec.CPUs,
ssds: 1,
onlySetup: true,
})

// NB: Intentionally don't try to create tpce-100k snapshots
// here. There's no way to guarantee that there's only one
// writer writing these snapshots, and we leave it to the
// test that actually defined these tests.
//
// TODO(irfansharif): This is a bit awkward. Using snapshots
// across tests that use the same cluster spec is a
// reasonable pattern. We could do the
// following:
// - Creating snapshot-creation-only tests.
// - Including a UUID when creating snapshots, and then when
// listing them, listing only the one that sorts earliest
// alphabetically. This circumvents the multi-writer
// problem.
// - Opportunistically prune lower-sorted snapshots since
// they're not actually being used.
} else {
// Great! We can start with an existing tpce-100k dataset.
// Apply it and then make relevant changes on top.
t.L().Printf("inner: using %d pre-existing snapshot(s) with prefix %q",
len(tpce100kSnapshots), "tpce-100k")
if err := c.ApplySnapshots(ctx, tpce100kSnapshots); err != nil {
t.Fatal(err)
}
// Set up TPC-E with 100k customers.
//
// TODO(irfansharif): We can't simply re-use the snapshots
// already created by admission_control_index_backfill.go,
// though that'd be awfully convenient. When the two tests run
// simultaneously, the two clusters end up using the same
// cluster ID and there's cross-talk from persisted gossip state
// where we record IP addresses.

settings := install.MakeClusterSettings(install.NumRacksOption(crdbNodes))
if err := c.StartE(ctx, t.L(), option.DefaultStartOptsNoBackups(), settings, c.Range(1, crdbNodes)); err != nil {
t.Fatal(err)
}
}
runTPCE(ctx, t, c, tpceOptions{
start: func(ctx context.Context, t test.Test, c cluster.Cluster) {
settings := install.MakeClusterSettings(install.NumRacksOption(crdbNodes))
if err := c.StartE(ctx, t.L(), option.DefaultStartOptsNoBackups(), settings, c.Range(1, crdbNodes)); err != nil {
t.Fatal(err)
}
},
customers: 100_000,
disablePrometheus: true,
setupType: usingTPCEInit,
estimatedSetupTime: 4 * time.Hour,
nodes: crdbNodes,
cpus: clusterSpec.CPUs,
ssds: 1,
onlySetup: true,
})

// We have to create the droppable database next. We're going to
// do it by renaming the existing TPC-E database and re-creating
15 changes: 6 additions & 9 deletions pkg/cmd/roachtest/tests/admission_control_index_backfill.go
@@ -28,8 +28,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/version"
)

const tpce100kSnapshotPrefix = "tpce-100k"

func registerIndexBackfill(r registry.Registry) {
clusterSpec := r.MakeClusterSpec(
10, /* nodeCount */
@@ -43,15 +41,14 @@ func registerIndexBackfill(r registry.Registry) {
clusterSpec.GCEVolumeType = "pd-ssd"

r.Add(registry.TestSpec{
Name: "admission-control/index-backfill",
Timeout: 6 * time.Hour,
Owner: registry.OwnerAdmissionControl,
Benchmark: true,
// TODO(irfansharif): Reduce to weekly cadence once stabilized.
// Tags: registry.Tags(`weekly`),
Name: "admission-control/index-backfill",
Timeout: 6 * time.Hour,
Owner: registry.OwnerAdmissionControl,
Benchmark: true,
Tags: registry.Tags(`weekly`),
Cluster: clusterSpec,
RequiresLicense: true,
SnapshotPrefix: tpce100kSnapshotPrefix,
SnapshotPrefix: "index-backfill-tpce-100k",
Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
crdbNodes := c.Spec().NodeCount - 1
workloadNode := c.Spec().NodeCount
