From bee2ec5ad40f29f068873bd6e2c6b06264166a74 Mon Sep 17 00:00:00 2001 From: irfan sharif Date: Fri, 10 Jul 2020 23:17:40 -0400 Subject: [PATCH] roach{test,prod},acceptance: no longer rely on auto-init `cockroach start` without `--join` auto-initializes the cluster. This was deprecated in 19.2 and will be removed in a future commit. We update test code that relies on the previous behavior. To (mostly) retain existing roachprod behavior, we auto-initialize when `roachprod start`-ing n1. We introduce a new `--skip-init` flag for when that is not desired (for example in roachtests that restart n1, but don't intend to re-initialize the cluster). Most of the roachtest changes are exactly to this effect, where tests that invoke `roachprod start` a second time on n1 are changed to now specify this new `--skip-init` flag. Release note: None --- pkg/acceptance/cluster/dockercluster.go | 8 +- pkg/acceptance/cluster/testconfig.pb.go | 80 +++++++------- pkg/acceptance/cluster/testconfig.proto | 4 +- pkg/cmd/roachprod/install/cockroach.go | 137 ++++++++++++++++-------- pkg/cmd/roachprod/main.go | 4 + pkg/cmd/roachtest/autoupgrade.go | 2 +- pkg/cmd/roachtest/clearrange.go | 4 +- pkg/cmd/roachtest/cli.go | 2 +- pkg/cmd/roachtest/clock_jump_crash.go | 2 +- pkg/cmd/roachtest/cluster.go | 10 +- pkg/cmd/roachtest/cluster_init.go | 38 +------ pkg/cmd/roachtest/decommission.go | 8 +- pkg/cmd/roachtest/election.go | 2 +- pkg/cmd/roachtest/encryption.go | 2 +- pkg/cmd/roachtest/engine_switch.go | 2 +- pkg/cmd/roachtest/gossip.go | 8 +- pkg/cmd/roachtest/inconsistency.go | 2 +- pkg/cmd/roachtest/kv.go | 4 +- pkg/cmd/roachtest/quit.go | 4 +- pkg/cmd/roachtest/replicagc.go | 2 +- pkg/cmd/roachtest/tpcc.go | 2 +- pkg/cmd/roachtest/version.go | 6 +- pkg/cmd/roachtest/versionupgrade.go | 4 +- 23 files changed, 180 insertions(+), 157 deletions(-) diff --git a/pkg/acceptance/cluster/dockercluster.go b/pkg/acceptance/cluster/dockercluster.go index 654491c6dadd..cadb8397a783 100644 --- 
a/pkg/acceptance/cluster/dockercluster.go +++ b/pkg/acceptance/cluster/dockercluster.go @@ -480,10 +480,12 @@ func (l *DockerCluster) startNode(ctx context.Context, node *testNode) { } cmd = append(cmd, fmt.Sprintf("--store=%s", storeSpec)) } - // Append --join flag (for all nodes except first in bootstrap-node-zero mode) - if node.index > 0 || l.config.InitMode != INIT_BOOTSTRAP_NODE_ZERO { - cmd = append(cmd, "--join="+net.JoinHostPort(l.Nodes[0].nodeStr, base.DefaultPort)) + // Append --join flag for all nodes. + firstNodeAddr := "" + if node.index > 0 { + firstNodeAddr = l.Nodes[0].nodeStr } + cmd = append(cmd, "--join="+net.JoinHostPort(firstNodeAddr, base.DefaultPort)) dockerLogDir := "/logs/" + node.nodeStr localLogDir := filepath.Join(l.volumesDir, "logs", node.nodeStr) diff --git a/pkg/acceptance/cluster/testconfig.pb.go b/pkg/acceptance/cluster/testconfig.pb.go index a2eecc82c2e0..9825aff03499 100644 --- a/pkg/acceptance/cluster/testconfig.pb.go +++ b/pkg/acceptance/cluster/testconfig.pb.go @@ -30,9 +30,6 @@ const ( // INIT_COMMAND starts every node with a join flag and issues the // init command. INIT_COMMAND InitMode = 0 - // INIT_BOOTSTRAP_NODE_ZERO uses the legacy protocol of omitting the - // join flag from node zero. - INIT_BOOTSTRAP_NODE_ZERO InitMode = 1 // INIT_NONE starts every node with a join flag and leaves the // cluster uninitialized. 
INIT_NONE InitMode = 2 @@ -40,13 +37,11 @@ const ( var InitMode_name = map[int32]string{ 0: "INIT_COMMAND", - 1: "INIT_BOOTSTRAP_NODE_ZERO", 2: "INIT_NONE", } var InitMode_value = map[string]int32{ - "INIT_COMMAND": 0, - "INIT_BOOTSTRAP_NODE_ZERO": 1, - "INIT_NONE": 2, + "INIT_COMMAND": 0, + "INIT_NONE": 2, } func (x InitMode) Enum() *InitMode { @@ -66,7 +61,7 @@ func (x *InitMode) UnmarshalJSON(data []byte) error { return nil } func (InitMode) EnumDescriptor() ([]byte, []int) { - return fileDescriptor_testconfig_af28e3620e9a1d6b, []int{0} + return fileDescriptor_testconfig_a82c0aa336d029cb, []int{0} } // StoreConfig holds the configuration of a collection of similar stores. @@ -78,7 +73,7 @@ func (m *StoreConfig) Reset() { *m = StoreConfig{} } func (m *StoreConfig) String() string { return proto.CompactTextString(m) } func (*StoreConfig) ProtoMessage() {} func (*StoreConfig) Descriptor() ([]byte, []int) { - return fileDescriptor_testconfig_af28e3620e9a1d6b, []int{0} + return fileDescriptor_testconfig_a82c0aa336d029cb, []int{0} } func (m *StoreConfig) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -113,7 +108,7 @@ func (m *NodeConfig) Reset() { *m = NodeConfig{} } func (m *NodeConfig) String() string { return proto.CompactTextString(m) } func (*NodeConfig) ProtoMessage() {} func (*NodeConfig) Descriptor() ([]byte, []int) { - return fileDescriptor_testconfig_af28e3620e9a1d6b, []int{1} + return fileDescriptor_testconfig_a82c0aa336d029cb, []int{1} } func (m *NodeConfig) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -154,7 +149,7 @@ func (m *TestConfig) Reset() { *m = TestConfig{} } func (m *TestConfig) String() string { return proto.CompactTextString(m) } func (*TestConfig) ProtoMessage() {} func (*TestConfig) Descriptor() ([]byte, []int) { - return fileDescriptor_testconfig_af28e3620e9a1d6b, []int{2} + return fileDescriptor_testconfig_a82c0aa336d029cb, []int{2} } func (m *TestConfig) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ 
-810,36 +805,35 @@ var ( ) func init() { - proto.RegisterFile("acceptance/cluster/testconfig.proto", fileDescriptor_testconfig_af28e3620e9a1d6b) -} - -var fileDescriptor_testconfig_af28e3620e9a1d6b = []byte{ - // 422 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x7c, 0x90, 0xcd, 0x6e, 0xd3, 0x40, - 0x14, 0x85, 0x3d, 0xf9, 0x21, 0xc9, 0x2d, 0x45, 0xd1, 0x08, 0x24, 0xab, 0x2a, 0x53, 0x2b, 0x95, - 0x90, 0xcb, 0xc2, 0x11, 0x79, 0x83, 0xa6, 0x89, 0x90, 0x17, 0xb1, 0x91, 0x6b, 0x09, 0xa9, 0x1b, - 0x6b, 0x34, 0x1e, 0xcc, 0x08, 0x3c, 0x53, 0xd9, 0x13, 0xe8, 0x23, 0xc0, 0x8e, 0x77, 0xe0, 0x65, - 0xb2, 0xec, 0xb2, 0xab, 0x0a, 0x9c, 0xb7, 0x60, 0x85, 0xec, 0x8e, 0x1b, 0xd8, 0x64, 0x67, 0x9f, - 0x7b, 0xce, 0xf9, 0xee, 0x1d, 0x38, 0xa5, 0x8c, 0xf1, 0x6b, 0x4d, 0x25, 0xe3, 0x53, 0xf6, 0x79, - 0x5d, 0x6a, 0x5e, 0x4c, 0x35, 0x2f, 0x35, 0x53, 0xf2, 0x83, 0xc8, 0xbc, 0xeb, 0x42, 0x69, 0x85, - 0x8f, 0x99, 0x62, 0x9f, 0x0a, 0x45, 0xd9, 0x47, 0x6f, 0x67, 0xf7, 0x8c, 0xfd, 0xe8, 0x79, 0xa6, - 0x32, 0xd5, 0x18, 0xa7, 0xf5, 0xd7, 0x43, 0x66, 0x32, 0x83, 0x83, 0x4b, 0xad, 0x0a, 0x7e, 0xd1, - 0x14, 0xe1, 0x53, 0x80, 0x9c, 0xde, 0x24, 0x05, 0x95, 0x19, 0x2f, 0xed, 0x8e, 0x83, 0xdc, 0xfe, - 0xbc, 0xb7, 0xb9, 0x3f, 0xb1, 0xa2, 0x51, 0x4e, 0x6f, 0xa2, 0x46, 0x9e, 0xac, 0x01, 0x02, 0x95, - 0xb6, 0x11, 0x02, 0x83, 0x2f, 0xbc, 0x28, 0x85, 0x92, 0x36, 0x72, 0x90, 0x3b, 0x32, 0xfe, 0x56, - 0xc4, 0x6f, 0xe1, 0x49, 0x59, 0x13, 0xea, 0xba, 0xae, 0x7b, 0x30, 0x3b, 0xf3, 0xf6, 0xad, 0xe9, - 0xfd, 0xb3, 0x8d, 0x69, 0x32, 0xf1, 0xc9, 0xf7, 0x0e, 0x40, 0xcc, 0x4b, 0x6d, 0xb8, 0x36, 0xf4, - 0x24, 0xcd, 0xf9, 0x7f, 0xd0, 0x46, 0xc1, 0x0b, 0xe8, 0x4b, 0x95, 0x3e, 0x02, 0xdd, 0xfd, 0xc0, - 0xdd, 0x29, 0xa6, 0xe4, 0x21, 0x8c, 0xdf, 0xc0, 0x30, 0x5d, 0x17, 0x54, 0xd7, 0x87, 0x75, 0x1d, - 0xe4, 0x76, 0xe7, 0x2f, 0xea, 0xf1, 0x9f, 0xfb, 0x93, 0x43, 0x2d, 0x72, 0xee, 0x2d, 0xcc, 0x30, - 0x7a, 0xb4, 0x61, 0x1f, 0x46, 0x42, 0x0a, 0x9d, 0xe4, 0x2a, 0xe5, 0x76, 
0xcf, 0x41, 0xee, 0xb3, - 0xd9, 0xab, 0xfd, 0x70, 0x5f, 0x0a, 0xbd, 0x52, 0x29, 0x37, 0xe8, 0xa1, 0x30, 0xff, 0xf8, 0x25, - 0x0c, 0xa4, 0x4a, 0xbe, 0x52, 0xa1, 0xed, 0xbe, 0x83, 0xdc, 0x61, 0xfb, 0x16, 0x52, 0xbd, 0xa7, - 0x42, 0xbf, 0x0e, 0x61, 0xd8, 0x46, 0xf1, 0x18, 0x9e, 0xfa, 0x81, 0x1f, 0x27, 0x17, 0xe1, 0x6a, - 0x75, 0x1e, 0x2c, 0xc6, 0x16, 0x3e, 0x06, 0xbb, 0x51, 0xe6, 0x61, 0x18, 0x5f, 0xc6, 0xd1, 0xf9, - 0xbb, 0x24, 0x08, 0x17, 0xcb, 0xe4, 0x6a, 0x19, 0x85, 0x63, 0x84, 0x0f, 0x61, 0xd4, 0x4c, 0x83, - 0x30, 0x58, 0x8e, 0x3b, 0x47, 0xbd, 0x6f, 0x3f, 0x89, 0x35, 0x3f, 0xdb, 0xfc, 0x26, 0xd6, 0xa6, - 0x22, 0xe8, 0xb6, 0x22, 0xe8, 0xae, 0x22, 0xe8, 0x57, 0x45, 0xd0, 0x8f, 0x2d, 0xb1, 0x6e, 0xb7, - 0xc4, 0xba, 0xdb, 0x12, 0xeb, 0x6a, 0x60, 0x76, 0xfe, 0x1b, 0x00, 0x00, 0xff, 0xff, 0x5a, 0x33, - 0x2b, 0x9d, 0x8c, 0x02, 0x00, 0x00, + proto.RegisterFile("acceptance/cluster/testconfig.proto", fileDescriptor_testconfig_a82c0aa336d029cb) +} + +var fileDescriptor_testconfig_a82c0aa336d029cb = []byte{ + // 405 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x7c, 0x90, 0xc1, 0x6e, 0xd3, 0x40, + 0x14, 0x45, 0x3d, 0x89, 0x43, 0xec, 0x57, 0x8a, 0xac, 0x11, 0x48, 0x56, 0x05, 0x53, 0xcb, 0x95, + 0x90, 0xcb, 0xc2, 0x11, 0xd9, 0xb0, 0x26, 0x0d, 0x42, 0x5e, 0xc4, 0x95, 0x42, 0x25, 0x24, 0x36, + 0xd6, 0x68, 0x3c, 0x98, 0x11, 0x78, 0xa6, 0xb2, 0x27, 0xd0, 0x4f, 0x80, 0x1d, 0xff, 0xc0, 0xcf, + 0x64, 0xd9, 0x65, 0x57, 0x15, 0x38, 0x7f, 0xc1, 0x0a, 0xd9, 0x1d, 0x27, 0x74, 0x93, 0x9d, 0x7d, + 0xdf, 0xbd, 0xf7, 0xbc, 0x37, 0x70, 0x42, 0x19, 0xe3, 0x97, 0x9a, 0x4a, 0xc6, 0x27, 0xec, 0xcb, + 0xaa, 0xd6, 0xbc, 0x9a, 0x68, 0x5e, 0x6b, 0xa6, 0xe4, 0x47, 0x51, 0xc4, 0x97, 0x95, 0xd2, 0x0a, + 0x3f, 0x65, 0x8a, 0x7d, 0xae, 0x14, 0x65, 0x9f, 0xe2, 0x9d, 0x3d, 0x36, 0xf6, 0xa3, 0xc7, 0x85, + 0x2a, 0x54, 0x67, 0x9c, 0xb4, 0x5f, 0x77, 0x99, 0x70, 0x0a, 0x07, 0xef, 0xb4, 0xaa, 0xf8, 0x59, + 0x57, 0x84, 0x4f, 0x00, 0x4a, 0x7a, 0x95, 0x55, 0x54, 
0x16, 0xbc, 0xf6, 0x07, 0x01, 0x8a, 0x46, + 0x33, 0x7b, 0x7d, 0x7b, 0x6c, 0x2d, 0xdd, 0x92, 0x5e, 0x2d, 0x3b, 0x39, 0x5c, 0x01, 0xa4, 0x2a, + 0xef, 0x23, 0x04, 0xc6, 0x5f, 0x79, 0x55, 0x0b, 0x25, 0x7d, 0x14, 0xa0, 0xc8, 0x35, 0xfe, 0x5e, + 0xc4, 0x6f, 0xe1, 0x41, 0xdd, 0x12, 0xda, 0xba, 0x61, 0x74, 0x30, 0x3d, 0x8d, 0xf7, 0xad, 0x19, + 0xff, 0xb7, 0x8d, 0x69, 0x32, 0xf1, 0xf0, 0xc7, 0x00, 0xe0, 0x82, 0xd7, 0xda, 0x70, 0x7d, 0xb0, + 0x25, 0x2d, 0xf9, 0x3d, 0x68, 0xa7, 0xe0, 0x39, 0x8c, 0xa4, 0xca, 0xb7, 0xc0, 0x68, 0x3f, 0x70, + 0x77, 0x8a, 0x29, 0xb9, 0x0b, 0xe3, 0x97, 0xe0, 0xe4, 0xab, 0x8a, 0xea, 0xf6, 0xb0, 0x61, 0x80, + 0xa2, 0xe1, 0xec, 0x49, 0x3b, 0xfe, 0x7b, 0x7b, 0x7c, 0xa8, 0x45, 0xc9, 0xe3, 0xb9, 0x19, 0x2e, + 0xb7, 0x36, 0x9c, 0x80, 0x2b, 0xa4, 0xd0, 0x59, 0xa9, 0x72, 0xee, 0xdb, 0x01, 0x8a, 0x1e, 0x4d, + 0x9f, 0xef, 0x87, 0x27, 0x52, 0xe8, 0x85, 0xca, 0xb9, 0x41, 0x3b, 0xc2, 0xfc, 0xe3, 0x67, 0x30, + 0x96, 0x2a, 0xfb, 0x46, 0x85, 0xf6, 0x47, 0x01, 0x8a, 0x9c, 0xfe, 0x2d, 0xa4, 0x7a, 0x4f, 0x85, + 0x7e, 0xf1, 0x0a, 0x9c, 0x3e, 0x8a, 0x3d, 0x78, 0x98, 0xa4, 0xc9, 0x45, 0x76, 0x76, 0xbe, 0x58, + 0xbc, 0x4e, 0xe7, 0x9e, 0x85, 0x0f, 0xc1, 0xed, 0x94, 0xf4, 0x3c, 0x7d, 0xe3, 0x0d, 0x8e, 0xec, + 0xef, 0xbf, 0x88, 0x15, 0xda, 0x0e, 0xf2, 0xd0, 0xec, 0x74, 0xfd, 0x87, 0x58, 0xeb, 0x86, 0xa0, + 0xeb, 0x86, 0xa0, 0x9b, 0x86, 0xa0, 0xdf, 0x0d, 0x41, 0x3f, 0x37, 0xc4, 0xba, 0xde, 0x10, 0xeb, + 0x66, 0x43, 0xac, 0x0f, 0x63, 0xb3, 0xdb, 0xbf, 0x00, 0x00, 0x00, 0xff, 0xff, 0x09, 0x22, 0x2a, + 0xa7, 0x74, 0x02, 0x00, 0x00, } diff --git a/pkg/acceptance/cluster/testconfig.proto b/pkg/acceptance/cluster/testconfig.proto index d727cec1458f..cc09659e660e 100644 --- a/pkg/acceptance/cluster/testconfig.proto +++ b/pkg/acceptance/cluster/testconfig.proto @@ -22,9 +22,7 @@ enum InitMode { // init command. INIT_COMMAND = 0; - // INIT_BOOTSTRAP_NODE_ZERO uses the legacy protocol of omitting the - // join flag from node zero. 
- INIT_BOOTSTRAP_NODE_ZERO = 1; + reserved 1; // INIT_NONE starts every node with a join flag and leaves the // cluster uninitialized. diff --git a/pkg/cmd/roachprod/install/cockroach.go b/pkg/cmd/roachprod/install/cockroach.go index 7a865835c489..a5899dfaf6fc 100644 --- a/pkg/cmd/roachprod/install/cockroach.go +++ b/pkg/cmd/roachprod/install/cockroach.go @@ -30,6 +30,7 @@ import ( var StartOpts struct { Encrypt bool Sequential bool + SkipInit bool } // Cockroach TODO(peter): document @@ -112,17 +113,17 @@ func argExists(args []string, target string) int { // Start implements the ClusterImpl.NodeDir interface. func (r Cockroach) Start(c *SyncedCluster, extraArgs []string) { - // Check to see if node 1 was started indicating the cluster was + // Check to see if node 1 was started, indicating the cluster is to be // bootstrapped. - var bootstrapped bool + var bootstrappable bool for _, i := range c.ServerNodes() { if i == 1 { - bootstrapped = true + bootstrappable = true break } } - if c.Secure && bootstrapped { + if c.Secure && bootstrappable { c.DistributeCerts() } @@ -217,7 +218,8 @@ func (r Cockroach) Start(c *SyncedCluster, extraArgs []string) { args = append(args, "--locality="+locality) } } - if nodes[i] != 1 { + // `cockroach start` without `--join` is no longer supported as of 20.1. + if nodes[i] != 1 || vers.AtLeast(version.MustParse("v20.1.0")) { args = append(args, fmt.Sprintf("--join=%s:%d", host1, r.NodePort(c, 1))) } if advertisePublicIP { @@ -271,11 +273,13 @@ func (r Cockroach) Start(c *SyncedCluster, extraArgs []string) { // unhelpful empty error (since everything has been redirected away). This is // unfortunately equally awkward to address. cmd := "ulimit -c unlimited; mkdir -p " + logDir + "; " + // TODO(peter): The ps and lslocks stuff is intended to debug why killing // of a cockroach process sometimes doesn't release file locks immediately. 
cmd += `echo ">>> roachprod start: $(date)" >> ` + logDir + "/roachprod.log; " + `ps axeww -o pid -o command >> ` + logDir + "/roachprod.log; " + `[ -x /usr/bin/lslocks ] && /usr/bin/lslocks >> ` + logDir + "/roachprod.log; " + cmd += keyCmd + fmt.Sprintf(" export ROACHPROD=%d%s && ", nodes[i], c.Tag) + "GOTRACEBACK=crash " + @@ -297,49 +301,96 @@ func (r Cockroach) Start(c *SyncedCluster, extraArgs []string) { return nil, nil }) - if bootstrapped { - license := envutil.EnvOrDefaultString("COCKROACH_DEV_LICENSE", "") - if license == "" { - fmt.Printf("%s: COCKROACH_DEV_LICENSE unset: enterprise features will be unavailable\n", - c.Name) + if StartOpts.SkipInit || !bootstrappable { + return + } + + var initOut string + display = fmt.Sprintf("%s: bootstrapping cluster", c.Name) + c.Parallel(display, 1, 0, func(i int) ([]byte, error) { + vers, err := getCockroachVersion(c, nodes[i]) + if err != nil { + return nil, err } + if !vers.AtLeast(version.MustParse("v20.1.0")) { + // `cockroach start` without `--join` is no longer supported as of v20.1. + return nil, nil + } + sess, err := c.newSession(1) + if err != nil { + return nil, err + } + defer sess.Close() - var msg string - display = fmt.Sprintf("%s: initializing cluster settings", c.Name) - c.Parallel(display, 1, 0, func(i int) ([]byte, error) { - sess, err := c.newSession(1) - if err != nil { - return nil, err - } - defer sess.Close() + var cmd string + if c.IsLocal() { + cmd = `cd ${HOME}/local/1 ; ` + } - var cmd string - if c.IsLocal() { - cmd = `cd ${HOME}/local/1 ; ` - } - dir := c.Impl.NodeDir(c, nodes[i]) - cmd += ` -if ! 
test -e ` + dir + `/settings-initialized ; then - COCKROACH_CONNECT_TIMEOUT=0 ` + cockroachNodeBinary(c, 1) + " sql --url " + - r.NodeURL(c, "localhost", r.NodePort(c, 1)) + " -e " + - fmt.Sprintf(`" -SET CLUSTER SETTING server.remote_debugging.mode = 'any'; -SET CLUSTER SETTING cluster.organization = 'Cockroach Labs - Production Testing'; -SET CLUSTER SETTING enterprise.license = '%s';"`, license) + ` && - touch ` + dir + `/settings-initialized -fi -` - out, err := sess.CombinedOutput(cmd) - if err != nil { - return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out) - } - msg = strings.TrimSpace(string(out)) - return nil, nil - }) + binary := cockroachNodeBinary(c, 1) + path := fmt.Sprintf("%s/%s", c.Impl.NodeDir(c, nodes[i]), "cluster-bootstrapped") + url := r.NodeURL(c, "localhost", r.NodePort(c, 1)) - if msg != "" { - fmt.Println(msg) + cmd += fmt.Sprintf(` + if ! test -e %s ; then + COCKROACH_CONNECT_TIMEOUT=0 %s init --url %s && touch %s + fi`, path, binary, url, path) + + out, err := sess.CombinedOutput(cmd) + if err != nil { + return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out) + } + initOut = strings.TrimSpace(string(out)) + return nil, nil + }) + + if initOut != "" { + fmt.Println(initOut) + } + + license := envutil.EnvOrDefaultString("COCKROACH_DEV_LICENSE", "") + if license == "" { + fmt.Printf("%s: COCKROACH_DEV_LICENSE unset: enterprise features will be unavailable\n", + c.Name) + } + + var clusterSettingsOut string + display = fmt.Sprintf("%s: initializing cluster settings", c.Name) + c.Parallel(display, 1, 0, func(i int) ([]byte, error) { + sess, err := c.newSession(1) + if err != nil { + return nil, err + } + defer sess.Close() + + var cmd string + if c.IsLocal() { + cmd = `cd ${HOME}/local/1 ; ` } + + binary := cockroachNodeBinary(c, 1) + path := fmt.Sprintf("%s/%s", c.Impl.NodeDir(c, nodes[i]), "settings-initialized") + url := r.NodeURL(c, "localhost", r.NodePort(c, 1)) + + cmd += fmt.Sprintf(` + if ! 
test -e %s ; then + COCKROACH_CONNECT_TIMEOUT=0 %s sql --url %s -e " + SET CLUSTER SETTING server.remote_debugging.mode = 'any'; + SET CLUSTER SETTING cluster.organization = 'Cockroach Labs - Production Testing'; + SET CLUSTER SETTING enterprise.license = '%s';" \ + && touch %s + fi`, path, binary, url, license, path) + + out, err := sess.CombinedOutput(cmd) + if err != nil { + return nil, errors.Wrapf(err, "~ %s\n%s", cmd, out) + } + clusterSettingsOut = strings.TrimSpace(string(out)) + return nil, nil + }) + + if clusterSettingsOut != "" { + fmt.Println(clusterSettingsOut) } } diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go index e5389d2fc400..fbc2520a35a5 100644 --- a/pkg/cmd/roachprod/main.go +++ b/pkg/cmd/roachprod/main.go @@ -91,6 +91,7 @@ var ( encrypt = false quiet = false sig = 9 + skipInit = false waitFlag = false stageOS string logsDir string @@ -1735,6 +1736,9 @@ func main() { &clusterType, "type", "t", clusterType, `cluster type ("cockroach" or "cassandra")`) cmd.Flags().BoolVar( &install.StartOpts.Encrypt, "encrypt", encrypt, "start nodes with encryption at rest turned on") + cmd.Flags().BoolVar( + &install.StartOpts.SkipInit, "skip-init", false, + "skip init step to bootstrap cockroach cluster") fallthrough case sqlCmd: cmd.Flags().StringVarP( diff --git a/pkg/cmd/roachtest/autoupgrade.go b/pkg/cmd/roachtest/autoupgrade.go index 9ffb596c4772..35cc777fbb65 100644 --- a/pkg/cmd/roachtest/autoupgrade.go +++ b/pkg/cmd/roachtest/autoupgrade.go @@ -131,7 +131,7 @@ func registerAutoUpgrade(r *testRegistry) { t.Fatal(err) } c.Put(ctx, cockroach, "./cockroach", c.Node(i)) - c.Start(ctx, t, c.Node(i), startArgsDontEncrypt) + c.Start(ctx, t, c.Node(i), startArgsDontEncrypt, startArgsSkipInit) if err := sleep(stageDuration); err != nil { t.Fatal(err) } diff --git a/pkg/cmd/roachtest/clearrange.go b/pkg/cmd/roachtest/clearrange.go index 270f9f313fa0..7dff0c7f3c57 100644 --- a/pkg/cmd/roachtest/clearrange.go +++ b/pkg/cmd/roachtest/clearrange.go 
@@ -55,11 +55,11 @@ func runClearRange(ctx context.Context, t *test, c *cluster, aggressiveChecks bo // This slows down merges, so it might hide some races. // // NB: the below invocation was found to actually make it to the server at the time of writing. - c.Start(ctx, t, startArgs( + c.Start(ctx, t, startArgsSkipInit, startArgs( "--env", "COCKROACH_CONSISTENCY_AGGRESSIVE=true COCKROACH_ENFORCE_CONSISTENT_STATS=true", )) } else { - c.Start(ctx, t) + c.Start(ctx, t, startArgsSkipInit) } // Also restore a much smaller table. We'll use it to run queries against diff --git a/pkg/cmd/roachtest/cli.go b/pkg/cmd/roachtest/cli.go index 555382f86d1a..2e56fab1f0b2 100644 --- a/pkg/cmd/roachtest/cli.go +++ b/pkg/cmd/roachtest/cli.go @@ -102,7 +102,7 @@ func runCLINodeStatus(ctx context.Context, t *test, c *cluster) { // Stop the cluster and restart only 2 of the nodes. Verify that three nodes // show up in the node status output. c.Stop(ctx, c.Range(1, 3)) - c.Start(ctx, t, c.Range(1, 2)) + c.Start(ctx, t, c.Range(1, 2), startArgsSkipInit) // Wait for the cluster to come back up. waitForFullReplication(t, db) diff --git a/pkg/cmd/roachtest/clock_jump_crash.go b/pkg/cmd/roachtest/clock_jump_crash.go index 80b02198c636..28a08c02b362 100644 --- a/pkg/cmd/roachtest/clock_jump_crash.go +++ b/pkg/cmd/roachtest/clock_jump_crash.go @@ -66,7 +66,7 @@ func runClockJump(ctx context.Context, t *test, c *cluster, tc clockJumpTestCase // restarting it if not. time.Sleep(3 * time.Second) if !isAlive(db, c.l) { - c.Start(ctx, t, c.Node(1)) + c.Start(ctx, t, c.Node(1), startArgsSkipInit) } }() defer offsetInjector.recover(ctx, c.spec.NodeCount) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index c9dcfe6cb34a..e2e196b44252 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1896,6 +1896,14 @@ func startArgs(extraArgs ...string) option { // encryption enabled. 
var startArgsDontEncrypt = startArgs("--encrypt=false") +// startArgsSkipInit will pass '--skip-init' to roachprod. +// +// NB: `roachprod start` attempts to auto-initialize the cluster when the target +// node is n1. It accepts the --skip-init flag to avoid doing so when needed, +// for example in roachtests that simply restart n1 (using `roachprod start`) +// without wanting to re-initialize the cluster. +var startArgsSkipInit = startArgs("--skip-init") + // racks is an option which specifies the number of racks to partition the nodes // into. func racks(n int) option { @@ -1939,7 +1947,7 @@ func (c *cluster) Restart(ctx context.Context, t *test, node nodeListOption) { var cancel func() ctx, cancel = context.WithTimeout(ctx, 30*time.Second) c.Stop(ctx, node) - c.Start(ctx, t, node) + c.Start(ctx, t, node, startArgsSkipInit) cancel() } diff --git a/pkg/cmd/roachtest/cluster_init.go b/pkg/cmd/roachtest/cluster_init.go index 66b709f8bc76..dad8194ee5d2 100644 --- a/pkg/cmd/roachtest/cluster_init.go +++ b/pkg/cmd/roachtest/cluster_init.go @@ -38,42 +38,8 @@ func runClusterInit(ctx context.Context, t *test, c *cluster) { t.Fatal("no address for first node") } - // Legacy-style init where we start node 1 without a join flag and then point - // the other nodes at it. 
- func() { - var g errgroup.Group - g.Go(func() error { - return c.RunE(ctx, c.Node(1), - `mkdir -p {log-dir} && `+ - `./cockroach start --insecure --background --store={store-dir} `+ - `--log-dir={log-dir} --cache=10% --max-sql-memory=10% `+ - `--listen-addr=:{pgport:1} --http-port=$[{pgport:1}+1] `+ - `> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`) - }) - for i := 2; i <= c.spec.NodeCount; i++ { - i := i - g.Go(func() error { - return c.RunE(ctx, c.Node(i), - fmt.Sprintf( - `mkdir -p {log-dir} && `+ - `./cockroach start --insecure --background --store={store-dir} `+ - `--log-dir={log-dir} --cache=10%% --max-sql-memory=10%% `+ - `--listen-addr=:{pgport:%[1]d} --http-port=$[{pgport:%[1]d}+1] `+ - `--join=`+addrs[0]+ - `> {log-dir}/cockroach.stdout 2> {log-dir}/cockroach.stderr`, i)) - }) - } - if err := g.Wait(); err != nil { - t.Fatal(err) - } - - db := c.Conn(ctx, 1) - defer db.Close() - waitForFullReplication(t, db) - }() - - // New-style init where we start all nodes with the same join flags and then - // issue an "init" command to one of the nodes. + // We start all nodes with the same join flags and then issue an "init" + // command to one of the nodes. for _, initNode := range []int{1, 2} { c.Wipe(ctx) diff --git a/pkg/cmd/roachtest/decommission.go b/pkg/cmd/roachtest/decommission.go index a8513e1d2328..63ce8c0c2305 100644 --- a/pkg/cmd/roachtest/decommission.go +++ b/pkg/cmd/roachtest/decommission.go @@ -214,8 +214,8 @@ func runDecommission(t *test, c *cluster, nodes int, duration time.Duration) { db := c.Conn(ctx, 1) defer db.Close() - c.Start(ctx, t, c.Node(node), startArgs(fmt.Sprintf("-a=--join %s --attrs=node%d", - c.InternalAddr(ctx, c.Node(nodes))[0], node))) + c.Start(ctx, t, c.Node(node), startArgsSkipInit, startArgs( + fmt.Sprintf("-a=--join %s --attrs=node%d", c.InternalAddr(ctx, c.Node(nodes))[0], node))) } // TODO(tschottdorf): run some ui sanity checks about decommissioned nodes // having disappeared. 
Verify that the workloads don't dip their qps or @@ -544,7 +544,7 @@ func runDecommissionAcceptance(ctx context.Context, t *test, c *cluster) { if err != nil { t.Fatalf("decommission failed: %v", err) } - c.Start(ctx, t, c.Node(1), args) + c.Start(ctx, t, c.Node(1), args, startArgsSkipInit) // Run a second time to wait until the replicas have all been GC'ed. // Note that we specify "all" because even though the first node is @@ -649,7 +649,7 @@ func runDecommissionAcceptance(ctx context.Context, t *test, c *cluster) { // with an address belonging to an old decommissioned node. { c.Wipe(ctx, c.Node(1)) - c.Start(ctx, t, c.Node(1), startArgs(fmt.Sprintf("-a=--join %s", + c.Start(ctx, t, c.Node(1), startArgsSkipInit, startArgs(fmt.Sprintf("-a=--join %s", c.InternalAddr(ctx, c.Node(2))[0]))) } diff --git a/pkg/cmd/roachtest/election.go b/pkg/cmd/roachtest/election.go index e243e89d1383..ecdac24112f3 100644 --- a/pkg/cmd/roachtest/election.go +++ b/pkg/cmd/roachtest/election.go @@ -57,7 +57,7 @@ func registerElectionAfterRestart(r *testRegistry) { t.Status("restarting") c.Stop(ctx) - c.Start(ctx, t) + c.Start(ctx, t, startArgsSkipInit) // Each of the 100 ranges in this table must elect a leader for // this query to complete. In naive raft, each of these diff --git a/pkg/cmd/roachtest/encryption.go b/pkg/cmd/roachtest/encryption.go index 2ccf9662bb92..a0892812feef 100644 --- a/pkg/cmd/roachtest/encryption.go +++ b/pkg/cmd/roachtest/encryption.go @@ -40,7 +40,7 @@ func registerEncryption(r *testRegistry) { } // Restart node with encryption turned on to verify old key works. - c.Start(ctx, t, c.Range(1, nodes), startArgs("--encrypt")) + c.Start(ctx, t, c.Range(1, nodes), startArgsSkipInit, startArgs("--encrypt")) testCLIGenKey := func(size int) error { // Generate encryption store key through `./cockroach gen encryption-key -s=size aes-size.key`. 
diff --git a/pkg/cmd/roachtest/engine_switch.go b/pkg/cmd/roachtest/engine_switch.go index 51587147bbaf..16aefc987900 100644 --- a/pkg/cmd/roachtest/engine_switch.go +++ b/pkg/cmd/roachtest/engine_switch.go @@ -119,7 +119,7 @@ func registerEngineSwitch(r *testRegistry) { if err := stop(i + 1); err != nil { return err } - c.Start(ctx, t, c.Node(i+1), args) + c.Start(ctx, t, c.Node(i+1), startArgsSkipInit, args) } return sleepAndCheck() }) diff --git a/pkg/cmd/roachtest/gossip.go b/pkg/cmd/roachtest/gossip.go index 18e3dc2b21ef..ba7cb257f6d1 100644 --- a/pkg/cmd/roachtest/gossip.go +++ b/pkg/cmd/roachtest/gossip.go @@ -112,7 +112,7 @@ SELECT string_agg(source_id::TEXT || ':' || target_id::TEXT, ',') deadNode = nodes.randNode()[0] c.Stop(ctx, c.Node(deadNode)) waitForGossip() - c.Start(ctx, t, c.Node(deadNode), args) + c.Start(ctx, t, c.Node(deadNode), startArgsSkipInit, args) } } @@ -270,7 +270,7 @@ func runGossipPeerings(ctx context.Context, t *test, c *cluster) { node := c.All().randNode() t.l.Printf("%d: restarting node %d\n", i, node[0]) c.Stop(ctx, node) - c.Start(ctx, t, node) + c.Start(ctx, t, node, startArgsSkipInit) } } @@ -294,7 +294,7 @@ func runGossipRestart(ctx context.Context, t *test, c *cluster) { c.Stop(ctx) t.l.Printf("%d: restarting all nodes\n", i) - c.Start(ctx, t) + c.Start(ctx, t, startArgsSkipInit) } } @@ -441,7 +441,7 @@ SELECT count(replicas) // Stop our special snowflake process which won't be recognized by the test // harness, and start it again on the regular. 
c.Stop(ctx, c.Node(1)) - c.Start(ctx, t, c.Node(1)) + c.Start(ctx, t, c.Node(1), startArgsSkipInit) } func runCheckLocalityIPAddress(ctx context.Context, t *test, c *cluster) { diff --git a/pkg/cmd/roachtest/inconsistency.go b/pkg/cmd/roachtest/inconsistency.go index 5183fb0c5626..869ca6586c3d 100644 --- a/pkg/cmd/roachtest/inconsistency.go +++ b/pkg/cmd/roachtest/inconsistency.go @@ -77,7 +77,7 @@ func runInconsistency(ctx context.Context, t *test, c *cluster) { "0x12040800100018002000280032280a10000000000000000000000000000000001a1066616b65207472616e73616374696f6e2a004a00") m := newMonitor(ctx, c) - c.Start(ctx, t, nodes) + c.Start(ctx, t, nodes, startArgsSkipInit) m.Go(func(ctx context.Context) error { select { case <-time.After(5 * time.Minute): diff --git a/pkg/cmd/roachtest/kv.go b/pkg/cmd/roachtest/kv.go index 00af38a2f593..60cb388725bb 100644 --- a/pkg/cmd/roachtest/kv.go +++ b/pkg/cmd/roachtest/kv.go @@ -341,7 +341,7 @@ func registerKVQuiescenceDead(r *testRegistry) { ) } t.l.Printf("QPS went from %.2f to %2.f with one node down\n", qpsAllUp, qpsOneDown) - c.Start(ctx, t, c.Node(nodes)) // satisfy dead node detector + c.Start(ctx, t, c.Node(nodes), startArgsSkipInit) // satisfy dead node detector }, }) } @@ -399,7 +399,7 @@ func registerKVGracefulDraining(r *testRegistry) { return nil case <-time.After(1 * time.Minute): } - c.Start(ctx, t, c.Node(nodes)) + c.Start(ctx, t, c.Node(nodes), startArgsSkipInit) } return nil }) diff --git a/pkg/cmd/roachtest/quit.go b/pkg/cmd/roachtest/quit.go index 01b014ddbd12..c8f2d5903433 100644 --- a/pkg/cmd/roachtest/quit.go +++ b/pkg/cmd/roachtest/quit.go @@ -93,7 +93,7 @@ func (q *quitTest) runTest( // restartNode restarts one node and waits until it's up and ready to // accept clients. 
func (q *quitTest) restartNode(ctx context.Context, nodeID int) { - q.c.Start(ctx, q.t, q.args, q.c.Node(nodeID)) + q.c.Start(ctx, q.t, q.args, q.c.Node(nodeID), startArgsSkipInit) q.t.l.Printf("waiting for readiness of node %d\n", nodeID) // Now perform a SQL query. This achieves two goals: @@ -450,7 +450,7 @@ func registerQuitAllNodes(r *testRegistry) { // At the end, restart all nodes. We do this to check that // the cluster can indeed restart, and also to please // the dead node detection check at the end of each test. - q.c.Start(ctx, q.t, q.args) + q.c.Start(ctx, q.t, q.args, startArgsSkipInit) }, }) } diff --git a/pkg/cmd/roachtest/replicagc.go b/pkg/cmd/roachtest/replicagc.go index cb6169bc012d..48c4da3789ad 100644 --- a/pkg/cmd/roachtest/replicagc.go +++ b/pkg/cmd/roachtest/replicagc.go @@ -161,5 +161,5 @@ func runReplicaGCChangedPeers(ctx context.Context, t *test, c *cluster, withRest } // Restart the remaining nodes to satisfy the dead node detector. - c.Start(ctx, t, c.Range(1, 2)) + c.Start(ctx, t, c.Range(1, 2), startArgsSkipInit) } diff --git a/pkg/cmd/roachtest/tpcc.go b/pkg/cmd/roachtest/tpcc.go index 4e2165c3a8c0..21413dbcb501 100644 --- a/pkg/cmd/roachtest/tpcc.go +++ b/pkg/cmd/roachtest/tpcc.go @@ -794,7 +794,7 @@ func runTPCCBench(ctx context.Context, t *test, c *cluster, b tpccBenchSpec) { // inter-trial interactions. m.ExpectDeaths(int32(len(roachNodes))) c.Stop(ctx, roachNodes) - c.Start(ctx, t, append(b.startOpts(), roachNodes)...) + c.Start(ctx, t, append(b.startOpts(), []option{roachNodes, startArgsSkipInit}...)...) time.Sleep(restartWait) // Set up the load generation configuration. 
diff --git a/pkg/cmd/roachtest/version.go b/pkg/cmd/roachtest/version.go index f5637aed4b20..e241f4250546 100644 --- a/pkg/cmd/roachtest/version.go +++ b/pkg/cmd/roachtest/version.go @@ -140,7 +140,7 @@ func registerVersion(r *testRegistry) { return err } c.Put(ctx, cockroach, "./cockroach", c.Node(i)) - c.Start(ctx, t, c.Node(i), startArgsDontEncrypt) + c.Start(ctx, t, c.Node(i), startArgsDontEncrypt, startArgsSkipInit) if err := sleepAndCheck(); err != nil { return err } @@ -177,7 +177,7 @@ func registerVersion(r *testRegistry) { return err } c.Put(ctx, b, "./cockroach", c.Node(i)) - c.Start(ctx, t, c.Node(i), startArgsDontEncrypt) + c.Start(ctx, t, c.Node(i), startArgsDontEncrypt, startArgsSkipInit) if err := sleepAndCheck(); err != nil { return err } @@ -191,7 +191,7 @@ func registerVersion(r *testRegistry) { return err } c.Put(ctx, cockroach, "./cockroach", c.Node(i)) - c.Start(ctx, t, c.Node(i), startArgsDontEncrypt) + c.Start(ctx, t, c.Node(i), startArgsDontEncrypt, startArgsSkipInit) if err := sleepAndCheck(); err != nil { return err } diff --git a/pkg/cmd/roachtest/versionupgrade.go b/pkg/cmd/roachtest/versionupgrade.go index 107eab6215cf..92e35ebaf8ed 100644 --- a/pkg/cmd/roachtest/versionupgrade.go +++ b/pkg/cmd/roachtest/versionupgrade.go @@ -292,7 +292,7 @@ func uploadAndStartFromCheckpointFixture(nodes nodeListOption, v string) version // Put and start the binary. args := u.uploadVersion(ctx, t, nodes, v) // NB: can't start sequentially since cluster already bootstrapped. 
- u.c.Start(ctx, t, nodes, args, startArgsDontEncrypt, roachprodArgOption{"--sequential=false"}) + u.c.Start(ctx, t, nodes, args, startArgsDontEncrypt, roachprodArgOption{"--sequential=false"}, startArgsSkipInit) } } @@ -312,7 +312,7 @@ func binaryUpgradeStep(nodes nodeListOption, newVersion string) versionStep { for _, node := range nodes { t.l.Printf("restarting node %d", node) c.Stop(ctx, c.Node(node)) - c.Start(ctx, t, c.Node(node), args, startArgsDontEncrypt) + c.Start(ctx, t, c.Node(node), args, startArgsDontEncrypt, startArgsSkipInit) t.l.Printf("node %d now running binary version %s", node, u.binaryVersion(ctx, t, node)) // TODO(nvanbenschoten): add upgrade qualification step. What should we