Skip to content

Commit

Permalink
Merge #130194
Browse files Browse the repository at this point in the history
130194: roachtest: separate-process deployments in mixedversion r=herkolategan,DarrylWong,srosenberg a=renatolabs

This commit adds support `separate-process` deployments in
`mixedversion` tests. It is a natural enhancement of previous work
where we introduced `shared-process` deployments.

When a test runs in a `separate-process` deployment mode, each upgrade
will involve restarting the binaries for both the system tenant (also
known as the storage cluster), as well as for the
application (non-system) tenant. The storage cluster needs to finalize
its upgrade before the application tenant starts upgrading.

Mixed-version functions can be called while the system tenant is
upgrading and/or while the application tenant is upgrading.

For simplicity, only one separate-process tenant is created, and
handles traffic generated by the test. In addition, a SQL server is
started on every node where we run a storage cluster binary. In other
words, the two processes are colocated (without any form of resource
allocation) on the same nodes.

For now, separate-process deployments are disabled on a number of
tests where it's known to fail. We should, over time, investigate
these failures and re-enable the new deployment mode on them.

Epic: none

Release note: None

Co-authored-by: Renato Costa <[email protected]>
  • Loading branch information
craig[bot] and renatolabs committed Sep 27, 2024
2 parents 83f4ec6 + fcc447c commit 008333e
Show file tree
Hide file tree
Showing 33 changed files with 1,339 additions and 609 deletions.
25 changes: 21 additions & 4 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1333,7 +1333,11 @@ func (c *clusterImpl) FetchTimeseriesData(ctx context.Context, l *logger.Logger)
sec = fmt.Sprintf("--certs-dir=%s", certs)
}
if err := c.RunE(
ctx, option.WithNodes(c.Node(node)), fmt.Sprintf("%s debug tsdump %s --port={pgport%s} --format=raw > tsdump.gob", test.DefaultCockroachPath, sec, c.Node(node)),
ctx, option.WithNodes(c.Node(node)),
fmt.Sprintf(
"%s debug tsdump %s --port={pgport%s:%s} --format=raw > tsdump.gob",
test.DefaultCockroachPath, sec, c.Node(node), install.SystemInterfaceName,
),
); err != nil {
return err
}
Expand Down Expand Up @@ -1408,8 +1412,14 @@ func (c *clusterImpl) FetchDebugZip(
// assumption that a down node will refuse the connection, so it won't
// waste our time.
for _, node := range nodes {
// `cockroach debug zip` does not support non root authentication.
nodePgUrl, err := c.InternalPGUrl(ctx, l, c.Node(node), roachprod.PGURLOptions{Auth: install.AuthRootCert})
pgURLOpts := roachprod.PGURLOptions{
// `cockroach debug zip` does not support non root authentication.
Auth: install.AuthRootCert,
// request the system tenant specifically in case the test
// changed the default virtual cluster.
VirtualClusterName: install.SystemInterfaceName,
}
nodePgUrl, err := c.InternalPGUrl(ctx, l, c.Node(node), pgURLOpts)
if err != nil {
l.Printf("cluster.FetchDebugZip failed to retrieve PGUrl on node %d: %v", node, err)
continue
Expand Down Expand Up @@ -2565,9 +2575,12 @@ func (c *clusterImpl) RunWithDetails(
}

l.Printf("> %s", cmd)
expanderCfg := install.ExpanderConfig{
DefaultVirtualCluster: c.defaultVirtualCluster,
}
results, err := roachprod.RunWithDetails(
ctx, l, c.MakeNodes(nodes), "" /* SSHOptions */, "", /* processTag */
c.IsSecure(), args, options,
c.IsSecure(), args, options.WithExpanderConfig(expanderCfg),
)

var logFileFull string
Expand Down Expand Up @@ -3118,6 +3131,10 @@ func (c *clusterImpl) WipeForReuse(
// particular, this overwrites the reuse policy to reflect what the test
// intends to do with it.
c.spec = newClusterSpec
// Reset the default virtual cluster before running a new test on
// this cluster.
c.defaultVirtualCluster = ""

return nil
}

Expand Down
11 changes: 11 additions & 0 deletions pkg/cmd/roachtest/option/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,17 @@ func SkipInit(opts interface{}) {
}
}

// Tag sets a process tag when stopping processes. Useful if we want
// to kill a cockroach process that was started with `settings.TagOption`.
func Tag(tag string) func(opts interface{}) {
return func(opts interface{}) {
switch opts := opts.(type) {
case *StopOpts:
opts.RoachprodOpts.ProcessTag = tag
}
}
}

// WithInitTarget allows the caller to configure which node is used as
// `InitTarget` when starting cockroach. Specially useful when
// starting clusters in a subset of VMs in the cluster that doesn't
Expand Down
29 changes: 18 additions & 11 deletions pkg/cmd/roachtest/roachtestutil/clusterupgrade/clusterupgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,19 +368,26 @@ func RestartNodesWithNewBinary(
rand.Shuffle(len(nodes), func(i, j int) {
nodes[i], nodes[j] = nodes[j], nodes[i]
})

// Stop the cockroach process gracefully in order to drain it properly.
// This makes the upgrade closer to how users do it in production, but
// it's also needed to eliminate flakiness. In particular, this will
// make sure that DistSQL draining information is communicated through
// gossip so that other nodes running an older version don't consider
// this upgraded node for DistSQL plans (see #87154 for more details).
stopOptions := []option.StartStopOption{option.Graceful(gracePeriod)}

// If we are starting the cockroach process with a tag, we apply the
// same tag when stopping.
for _, s := range settings {
if t, ok := s.(install.TagOption); ok {
stopOptions = append(stopOptions, option.Tag(string(t)))
}
}

for _, node := range nodes {
l.Printf("restarting node %d into version %s", node, newVersion.String())
// Stop the cockroach process gracefully in order to drain it properly.
// This makes the upgrade closer to how users do it in production, but
// it's also needed to eliminate flakiness. In particular, this will
// make sure that DistSQL draining information is dissipated through
// gossip so that other nodes running an older version don't consider
// this upgraded node for DistSQL plans (see #87154 for more details).
// TODO(yuzefovich): ideally, we would also check that the drain was
// successful since if it wasn't, then we might see flakes too.
if err := c.StopE(
ctx, l, option.NewStopOpts(option.Graceful(gracePeriod)), c.Node(node),
); err != nil {
if err := c.StopE(ctx, l, option.NewStopOpts(stopOptions...), c.Node(node)); err != nil {
return err
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ go_library(
"//pkg/cmd/roachtest/registry",
"//pkg/cmd/roachtest/roachtestutil",
"//pkg/cmd/roachtest/roachtestutil/clusterupgrade",
"//pkg/cmd/roachtest/spec",
"//pkg/cmd/roachtest/test",
"//pkg/roachpb",
"//pkg/roachprod/install",
Expand Down Expand Up @@ -56,6 +57,7 @@ go_test(
"//pkg/cmd/roachtest/registry",
"//pkg/cmd/roachtest/roachtestutil",
"//pkg/cmd/roachtest/roachtestutil/clusterupgrade",
"//pkg/cmd/roachtest/spec",
"//pkg/roachpb",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
Expand Down
23 changes: 4 additions & 19 deletions pkg/cmd/roachtest/roachtestutil/mixedversion/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ func (sc *ServiceContext) changeVersion(node int, v *clusterupgrade.Version) err
return nil
}

func (sc *ServiceContext) IsSystem() bool {
return sc.Descriptor.Name == install.SystemInterfaceName
}

// NodeVersion returns the release version the given `node` is
// currently running. Returns an error if the node is not valid (i.e.,
// the underlying service is not deployed on the node passed).
Expand Down Expand Up @@ -267,25 +271,6 @@ func (c *Context) DefaultService() *ServiceContext {
return c.Tenant
}

// SetFinalizing sets the `Finalizing` field on all services
// available.
func (c *Context) SetFinalizing(b bool) {
c.forEachService(func(s *ServiceContext) { s.Finalizing = b })
}

// SetStage is a helper function to set the upgrade stage on all
// services available.
func (c *Context) SetStage(stage UpgradeStage) {
c.forEachService(func(s *ServiceContext) { s.Stage = stage })
}

func (c *Context) forEachService(f func(*ServiceContext)) {
f(c.System)
if c.Tenant != nil {
f(c.Tenant)
}
}

// clone copies the caller Context and returns the copy.
func (c *Context) clone() Context {
return Context{
Expand Down
4 changes: 4 additions & 0 deletions pkg/cmd/roachtest/roachtestutil/mixedversion/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,10 @@ func (h *Helper) IsMultitenant() bool {
return h.Tenant != nil
}

func (h *Helper) DeploymentMode() DeploymentMode {
return h.runner.plan.deploymentMode
}

func (h *Helper) DefaultService() *Service {
if h.Tenant != nil {
return h.Tenant
Expand Down
Loading

0 comments on commit 008333e

Please sign in to comment.