From 32261e284f36dd59948f7d62d584f532554640af Mon Sep 17 00:00:00 2001 From: Stan Rosenberg Date: Wed, 17 May 2023 11:05:21 -0400 Subject: [PATCH] roachtest: metamorphic ARM64 and FIPS clusters Previously, all roachtests used (cloud) machine types with the AMD64 (cpu) architecture. Recently [1], new CI infrastructure was added to run a clone of all the nightly roachtests, configured with FIPS; i.e., same AMD64 machine types, different AMI and crdb binary, patched with FIPS-certified openssl native code. As of this PR, we add the capability to execute any roachtest in a cluster, configured with either ARM64, FIPS, or AMD64 (default). This is controlled via the two CLI args: `metamorphic-arm64-probability` and `metamorphic-fips-probability`. The former denotes the probability (over the uniform distribution) of a new cluster provisioned using ARM64 VMs. The latter denotes the probability of a new AMD64 cluster provisioned with the FIPS-compliant (kernel) configuration. In case a test is compatible only with AMD64, it's effectively excluded from the set; i.e., both probabilities apply to compatible tests only. Note, the two probabilities don't have to add up to 1. E.g., `metamorphic-arm64-probability==0.4`, `metamorphic-fips-probability==0.2` denotes that ARM64 clusters are chosen ~40% of the time, whereas of the remaining ~60% AMD clusters, FIPS is chosen ~20% of the time; i.e., ~12% of all clusters will use FIPS. Note, the values '0' and '1' are absolute. Setting both to '0' is tantamount to the behavior before this PR. Setting either to '1' enforces _all_ clusters are provisioned with either ARM64 or FIPS. A test can specify its required architecture, in which case, it takes precedence over metamorphic settings. This PR builds on [2], which enabled ARM64 provisioning for AWS in roachprod. 
We add ARM64 provisioning for GCE, i.e., T2A, as well as refactor 'arch' argument to denote one of: AMD64, ARM64, FIPS, where the latter isn't formally a CPU architecture; however, it simplifies provisioning and binary staging. We also modify roachprod.List to display CPU architecture, other than AMD64, with the machine type; this should make it easier to see which clusters are running ARM64 and FIPS configurations, as we ramp up their testing. The PR also adds validation to cockroach binaries and libs to ensure we can execute tests under ARM64 and FIPS. Furthermore, we add 'Enabled Assertions' header, generated at build time, to the cockroach binary; the header is used to validate whether or not the binary has runtime assertions enabled. Epic: none Release note: None Resolves: https://github.com/cockroachdb/cockroach/issues/94957 Resolves: https://github.com/cockroachdb/cockroach/issues/89268 Informs: https://github.com/cockroachdb/cockroach/issues/94986 [1] https://github.com/cockroachdb/cockroach/pull/99224 [2] https://github.com/cockroachdb/cockroach/pull/103243 --- pkg/build/BUILD.bazel | 1 + pkg/build/info.go | 36 +- pkg/build/info.proto | 2 + pkg/cmd/roachprod/flags.go | 5 +- pkg/cmd/roachprod/main.go | 18 +- pkg/cmd/roachtest/cluster.go | 372 +++++++++++++----- .../roachtest/cluster/cluster_interface.go | 3 + pkg/cmd/roachtest/cluster_test.go | 21 +- pkg/cmd/roachtest/github.go | 10 +- pkg/cmd/roachtest/github_test.go | 15 +- pkg/cmd/roachtest/main.go | 74 +++- .../mixedversion/mixedversion.go | 2 +- pkg/cmd/roachtest/slack.go | 2 - pkg/cmd/roachtest/spec/cluster_spec.go | 28 +- pkg/cmd/roachtest/spec/machine_type.go | 52 ++- pkg/cmd/roachtest/spec/option.go | 17 +- pkg/cmd/roachtest/test_impl.go | 1 + pkg/cmd/roachtest/test_registry_test.go | 7 + pkg/cmd/roachtest/test_runner.go | 76 +++- pkg/cmd/roachtest/test_test.go | 1 + pkg/cmd/roachtest/tests/autoupgrade.go | 4 - pkg/cmd/roachtest/tests/cdc.go | 12 +- pkg/cmd/roachtest/tests/cluster_to_cluster.go | 4 
+- pkg/cmd/roachtest/tests/decommission.go | 4 - pkg/cmd/roachtest/tests/follower_reads.go | 4 - pkg/cmd/roachtest/tests/import.go | 4 - .../roachtest/tests/mixed_version_backup.go | 6 +- pkg/cmd/roachtest/tests/mixed_version_cdc.go | 12 +- .../mixed_version_decl_schemachange_compat.go | 6 +- ...atibility_in_declarative_schema_changer.go | 4 - pkg/cmd/roachtest/tests/mixed_version_jobs.go | 4 - .../tests/mixed_version_schemachange.go | 4 - pkg/cmd/roachtest/tests/rebalance_load.go | 4 - pkg/cmd/roachtest/tests/restore.go | 6 +- pkg/cmd/roachtest/tests/secondary_indexes.go | 4 - pkg/cmd/roachtest/tests/tpcc.go | 14 +- ...ate_system_schema_after_version_upgrade.go | 4 - pkg/cmd/roachtest/tests/version.go | 4 - pkg/cmd/roachtest/tests/versionupgrade.go | 3 - pkg/roachprod/install/BUILD.bazel | 1 + pkg/roachprod/install/staging.go | 19 +- pkg/roachprod/install/staging_test.go | 3 +- pkg/roachprod/roachprod.go | 4 +- pkg/roachprod/vm/aws/aws.go | 26 +- pkg/roachprod/vm/gce/gcloud.go | 29 +- pkg/roachprod/vm/vm.go | 14 +- pkg/util/randutil/rand.go | 6 + 47 files changed, 666 insertions(+), 286 deletions(-) diff --git a/pkg/build/BUILD.bazel b/pkg/build/BUILD.bazel index db1d660daa0b..a4b25782961c 100644 --- a/pkg/build/BUILD.bazel +++ b/pkg/build/BUILD.bazel @@ -24,6 +24,7 @@ go_library( "github.com/cockroachdb/cockroach/pkg/build.utcTime": "{BUILD_UTCTIME}", }, deps = [ + "//pkg/util/buildutil", "//pkg/util/envutil", "//pkg/util/version", ], diff --git a/pkg/build/info.go b/pkg/build/info.go index d7b249dc21ac..8d0a7cffc9b7 100644 --- a/pkg/build/info.go +++ b/pkg/build/info.go @@ -19,6 +19,7 @@ import ( "text/tabwriter" "time" + "github.com/cockroachdb/cockroach/pkg/util/buildutil" "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/version" ) @@ -37,10 +38,11 @@ var ( cgoTargetTriple string platform = fmt.Sprintf("%s %s", runtime.GOOS, runtime.GOARCH) // Distribution is changed by the CCL init-time hook in non-APL builds. 
- Distribution = "OSS" - typ string // Type of this build: , "development", or "release" - channel string - envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown") + Distribution = "OSS" + typ string // Type of this build: , "development", or "release" + channel string + envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown") + enabledAssertions = buildutil.CrdbTestBuild //go:embed version.txt cockroachVersion string binaryVersion = computeBinaryVersion(cockroachVersion, rev) @@ -127,7 +129,8 @@ func (b Info) Long() string { fmt.Fprintf(tw, "Go Version: %s\n", b.GoVersion) fmt.Fprintf(tw, "C Compiler: %s\n", b.CgoCompiler) fmt.Fprintf(tw, "Build Commit ID: %s\n", b.Revision) - fmt.Fprintf(tw, "Build Type: %s", b.Type) // No final newline: cobra prints one for us. + fmt.Fprintf(tw, "Build Type: %s\n", b.Type) + fmt.Fprintf(tw, "Enabled Assertions: %t", b.EnabledAssertions) // No final newline: cobra prints one for us. _ = tw.Flush() return buf.String() } @@ -157,17 +160,18 @@ func GetInfo() Info { ch = "unknown" } return Info{ - GoVersion: runtime.Version(), - Tag: binaryVersion, - Time: utcTime, - Revision: rev, - CgoCompiler: cgoCompiler, - CgoTargetTriple: cgoTargetTriple, - Platform: platform, - Distribution: Distribution, - Type: typ, - Channel: ch, - EnvChannel: envChannel, + GoVersion: runtime.Version(), + Tag: binaryVersion, + Time: utcTime, + Revision: rev, + CgoCompiler: cgoCompiler, + CgoTargetTriple: cgoTargetTriple, + Platform: platform, + Distribution: Distribution, + Type: typ, + Channel: ch, + EnvChannel: envChannel, + EnabledAssertions: enabledAssertions, } } diff --git a/pkg/build/info.proto b/pkg/build/info.proto index 22d801551127..8e3e18a3a12f 100644 --- a/pkg/build/info.proto +++ b/pkg/build/info.proto @@ -40,6 +40,8 @@ message Info { optional string channel = 9 [(gogoproto.nullable) = false]; // env_channel identifies the product channel as overridden by the COCKROACH_CHANNEL environment variable. 
optional string env_channel = 11 [(gogoproto.nullable) = false]; + // enabled_assertions returns the value of 'CrdbTestBuild' (true iff compiled with 'crdb_test' tag) + optional bool enabled_assertions = 12 [(gogoproto.nullable) = false]; // dependencies exists to allow tests that run against old clusters // to unmarshal JSON containing this field. The tag is unimportant, diff --git a/pkg/cmd/roachprod/flags.go b/pkg/cmd/roachprod/flags.go index ea4fab89e132..61b15239619d 100644 --- a/pkg/cmd/roachprod/flags.go +++ b/pkg/cmd/roachprod/flags.go @@ -107,8 +107,9 @@ func initFlags() { vm.AllProviderNames())) createCmd.Flags().BoolVar(&createVMOpts.GeoDistributed, "geo", false, "Create geo-distributed cluster") - createCmd.Flags().BoolVar(&createVMOpts.EnableFIPS, - "fips", false, "Enable FIPS mode (uses custom AMI)") + createCmd.Flags().StringVar(&createVMOpts.Arch, "arch", "", + "architecture override for VM [amd64, arm64, fips]; N.B. fips implies amd64 with openssl") + // N.B. We set "usage=roachprod" as the default, custom label for billing tracking. createCmd.Flags().StringToStringVar(&createVMOpts.CustomLabels, "label", map[string]string{"usage": "roachprod"}, diff --git a/pkg/cmd/roachprod/main.go b/pkg/cmd/roachprod/main.go index daa00bea51f4..d0e9cd050575 100644 --- a/pkg/cmd/roachprod/main.go +++ b/pkg/cmd/roachprod/main.go @@ -277,6 +277,14 @@ hosts file. return err } } else { + machineType := func(clusterVMs vm.List) string { + res := clusterVMs[0].MachineType + // Display CPU architecture, other than amd64 (default). + if arch := clusterVMs[0].Labels["arch"]; arch != "" && arch != string(vm.ArchAMD64) { + res += fmt.Sprintf(" [%s]", arch) + } + return res + } // Align columns left and separate with at least two spaces. tw := tabwriter.NewWriter(os.Stdout, 0, 8, 2, ' ', tabwriter.AlignRight) // N.B. colors use escape codes which don't play nice with tabwriter [1]. @@ -304,7 +312,7 @@ hosts file. // N.B. Tabwriter doesn't support per-column alignment. 
It looks odd to have the cluster names right-aligned, // so we make it left-aligned. fmt.Fprintf(tw, "%s\t%s\t%d\t%s", name+strings.Repeat(" ", maxClusterName-len(name)), c.Clouds(), - len(c.VMs), c.VMs[0].MachineType) + len(c.VMs), machineType(c.VMs)) if !c.IsLocal() { colorByCostBucket := func(cost float64) func(string, ...interface{}) string { switch { @@ -1271,14 +1279,14 @@ func validateAndConfigure(cmd *cobra.Command, args []string) { // Validate architecture flag, if set. if archOpt := cmd.Flags().Lookup("arch"); archOpt != nil && archOpt.Changed { - arch := strings.ToLower(archOpt.Value.String()) + arch := vm.CPUArch(strings.ToLower(archOpt.Value.String())) - if arch != "amd64" && arch != "arm64" && arch != "fips" { + if arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS { printErrAndExit(fmt.Errorf("unsupported architecture %q", arch)) } - if arch != archOpt.Value.String() { + if string(arch) != archOpt.Value.String() { // Set the canonical value. - _ = cmd.Flags().Set("arch", arch) + _ = cmd.Flags().Set("arch", string(arch)) } } } diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index d41503580e2b..05ec8688b3bc 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -59,13 +59,20 @@ func init() { } var ( - // TODO(tbg): this is redundant with --cloud==local. Make the --local flag an - // alias for `--cloud=local` and remove this variable. 
- local bool - - cockroach string - cockroachShort string - libraryFilePaths []string + // user-specified path to crdb binary + cockroachPath string + // maps cpuArch to the corresponding crdb binary's absolute path + cockroach = make(map[vm.CPUArch]string) + // user-specified path to short crdb binary + cockroachShortPath string + // maps cpuArch to the corresponding short crdb (i.e., without UI) binary's absolute path + cockroachShort = make(map[vm.CPUArch]string) + // user-specified path to workload binary + workloadPath string + // maps cpuArch to the corresponding workload binary's absolute path + workload = make(map[vm.CPUArch]string) + // maps cpuArch to the corresponding dynamically-linked libraries' absolute paths + libraryFilePaths = make(map[vm.CPUArch][]string) cloud = spec.GCE // encryptionProbability controls when encryption-at-rest is enabled // in a cluster for tests that have opted-in to metamorphic @@ -75,10 +82,18 @@ var ( // encryption enabled by default (probability 1). In order to run // them with encryption disabled (perhaps to reproduce a test // failure), roachtest can be invoked with --metamorphic-encryption-probability=0 - encryptionProbability float64 + encryptionProbability float64 + // Total probability with which new ARM64 clusters are provisioned, modulo test specs. which are incompatible. + // N.B. if all selected tests are incompatible with ARM64, then arm64Probability is effectively 0. + // In other words, ClusterSpec.Arch takes precedence over the arm64Probability flag. + arm64Probability float64 + // Conditional probability with which new FIPS clusters are provisioned, modulo test specs. The total probability + // is the product of this and 1-arm64Probability. + // As in the case of arm64Probability, ClusterSpec.Arch takes precedence over the fipsProbability flag. 
+ fipsProbability float64 + instanceType string localSSDArg bool - workload string deprecatedRoachprodBinary string // overrideOpts contains vm.CreateOpts override values passed from the cli. overrideOpts vm.CreateOpts @@ -98,6 +113,8 @@ var ( const ( defaultEncryptionProbability = 1 + defaultFIPSProbability = 0 + defaultARM64Probability = 0 defaultCockroachPath = "./cockroach-default" ) @@ -109,29 +126,62 @@ func (e errBinaryOrLibraryNotFound) Error() string { return fmt.Sprintf("binary or library %q not found (or was not executable)", e.binary) } -func filepathAbs(path string) (string, error) { - path, err := filepath.Abs(path) +func validateBinaryFormat(path string, arch vm.CPUArch, checkEA bool) (string, error) { + abspath, err := filepath.Abs(path) if err != nil { return "", errors.WithStack(err) } - return path, nil -} - -func findBinary(binary, defValue string) (abspath string, err error) { - if binary == "" { - binary = defValue + // Check that the binary ELF format matches the expected architecture. + cmd := exec.Command("file", "-b", abspath) + var out bytes.Buffer + cmd.Stdout = &out + if err := cmd.Run(); err != nil { + return "", errors.Wrapf(err, "error executing 'file %s'", abspath) + } + fileFormat := strings.ToLower(out.String()) + if arch == vm.ArchARM64 { + if !strings.Contains(fileFormat, "arm64") && + !strings.Contains(fileFormat, "aarch64") { + return "", errors.Newf("%s has incompatible architecture; want: %q, got: %q", abspath, arch, fileFormat) + } + } else if arch == vm.ArchAMD64 && + // N.B. the "x86_64" string is returned on macOS, while "x86-64" is returned on Linux. + !(strings.Contains(fileFormat, "x86-64") || strings.Contains(fileFormat, "x86_64")) { + // Otherwise, we expect a binary that was built for amd64. 
+ return "", errors.Newf("%s has incompatible architecture; want: %q, got: %q", abspath, arch, fileFormat) + } + if arch == vm.ArchFIPS && strings.HasSuffix(abspath, "cockroach") { + // Check that the binary is patched to use OpenSSL FIPS. + // N.B. only the cockroach binary is patched, so we exclude this check for dynamically-linked libraries. + cmd = exec.Command("bash", "-c", fmt.Sprintf("nm %s | grep golang-fips |head -1", abspath)) + if err := cmd.Run(); err != nil { + return "", errors.Newf("%s is not compiled with FIPS", abspath) + } } + if checkEA { + // Check that the binary was compiled with assertions _enabled_. + cmd = exec.Command("bash", "-c", fmt.Sprintf("%s version |grep \"Enabled Assertions\" |grep true", abspath)) + if err := cmd.Run(); err != nil { + return "", errors.Newf("%s is not compiled with assertions enabled", abspath) + } + } + + return abspath, nil +} +func findBinary( + name string, osName string, arch vm.CPUArch, checkEA bool, +) (abspath string, err error) { // Check to see if binary exists and is a regular file and executable. 
- if fi, err := os.Stat(binary); err == nil && fi.Mode().IsRegular() && (fi.Mode()&0111) != 0 { - return filepathAbs(binary) + if fi, err := os.Stat(name); err == nil && fi.Mode().IsRegular() && (fi.Mode()&0111) != 0 { + return validateBinaryFormat(name, arch, checkEA) } - return findBinaryOrLibrary("bin", binary) + return findBinaryOrLibrary("bin", name, "", osName, arch, checkEA) } -func findLibrary(libraryName string) (string, error) { +func findLibrary(libraryName string, os string, arch vm.CPUArch) (string, error) { suffix := ".so" - if local { + if cloud == spec.Local { switch runtime.GOOS { case "linux": case "freebsd": @@ -145,65 +195,102 @@ func findLibrary(libraryName string) (string, error) { return "", errors.Newf("failed to find suffix for runtime %s", runtime.GOOS) } } - return findBinaryOrLibrary("lib", libraryName+suffix) + + return findBinaryOrLibrary("lib", libraryName, suffix, os, arch, false) } -func findBinaryOrLibrary(binOrLib string, name string) (string, error) { +// findBinaryOrLibrary searches for a binary or library, _first_ in the $PATH, _then_ in the following hardcoded paths, +// +// $GOPATH/src/github.com/cockroachdb/cockroach/ +// $GOPATH/src/github.com/cockroachdb/artifacts/ +// $PWD/binOrLib +// $GOPATH/src/github.com/cockroachdb/cockroach/binOrLib +// +// in the above order, unless 'name' is an absolute path, in which case the hardcoded paths are skipped. +// +// binOrLib is either 'bin' or 'lib'; nameSuffix is either empty, '.so', '.dll', or '.dylib'. +// Both osName and arch are used to derive a fully qualified binary or library name by inserting the +// corresponding arch suffix (see install.ArchInfoForOS), e.g. '.linux-arm64' or '.darwin-amd64'. +// That is, each hardcoded path is searched for a file named 'name' or 'name.nameSuffix.archSuffix', respectively. +// +// If no binary or library is found, an error is returned. 
+// Otherwise, if multiple binaries or libraries are located at the above paths, the first one found is returned. +// If the found binary or library happens to be of the wrong type, e.g., architecture is different from 'arch', or +// checkEA is true, and the binary was not compiled with runtime assertions enabled, an error is returned. +// While we could continue the search instead of returning an error, it is assumed the user can stage the binaries +// to avoid such ambiguity. Alternatively, the user can specify the absolute path to the binary or library, +// e.g., via --cockroach; in this case, only the absolute path is checked and validated. +func findBinaryOrLibrary( + binOrLib string, name string, nameSuffix string, osName string, arch vm.CPUArch, checkEA bool, +) (string, error) { // Find the binary to run and translate it to an absolute path. First, look // for the binary in PATH. - path, err := exec.LookPath(name) + pathFromEnv, err := exec.LookPath(name) + if err == nil { + // Found it in PATH, validate and return absolute path. + return validateBinaryFormat(pathFromEnv, arch, checkEA) + } + if strings.HasPrefix(name, "/") { + // Specified name is an absolute path, but we couldn't find it; bail out. + return "", errors.WithStack(err) + } + // We're unable to find the name in PATH and "name" is a relative path: + // look in the cockroach repo. 
+ gopath := os.Getenv("GOPATH") + if gopath == "" { + gopath = filepath.Join(os.Getenv("HOME"), "go") + } + + dirs := []string{ + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/"), + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/artifacts/"), + filepath.Join(os.ExpandEnv("$PWD"), binOrLib), + filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib), + } + + archInfo, err := install.ArchInfoForOS(osName, arch) if err != nil { - if strings.HasPrefix(name, "/") { - return "", errors.WithStack(err) - } - - // We're unable to find the name in PATH and "name" is a relative path: - // look in the cockroach repo. - gopath := os.Getenv("GOPATH") - if gopath == "" { - gopath = filepath.Join(os.Getenv("HOME"), "go") - } - - var suffix string - if !local { - suffix = ".docker_amd64" - } - dirs := []string{ - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/"), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach/artifacts/"), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib+suffix), - filepath.Join(os.ExpandEnv("$PWD"), binOrLib+suffix), - filepath.Join(gopath, "/src/github.com/cockroachdb/cockroach", binOrLib), - } - for _, dir := range dirs { - path = filepath.Join(dir, name) - var err2 error - path, err2 = exec.LookPath(path) - if err2 == nil { - return filepathAbs(path) + return "", err + } + archSuffixes := []string{"." + archInfo.DebugArchitecture, "." 
+ archInfo.ReleaseArchitecture} + + for _, dir := range dirs { + var path string + + if path, err = exec.LookPath(filepath.Join(dir, name)); err == nil { + return validateBinaryFormat(path, arch, checkEA) + } + for _, archSuffix := range archSuffixes { + if path, err = exec.LookPath(filepath.Join(dir, name+archSuffix+nameSuffix)); err == nil { + return validateBinaryFormat(path, arch, checkEA) } } - return "", errBinaryOrLibraryNotFound{name} } - return filepathAbs(path) + return "", errBinaryOrLibraryNotFound{name} } // VerifyLibraries verifies that the required libraries, specified by name, are // available for the target environment. -func VerifyLibraries(requiredLibs []string) error { +func VerifyLibraries(requiredLibs []string, arch vm.CPUArch) error { + foundLibraryPaths := libraryFilePaths[arch] + for _, requiredLib := range requiredLibs { - if !contains(libraryFilePaths, libraryNameFromPath, requiredLib) { - return errors.Wrap(errors.Errorf("missing required library %s", requiredLib), "cluster.VerifyLibraries") + if !contains(foundLibraryPaths, libraryNameFromPath, requiredLib) { + return errors.Wrap(errors.Errorf("missing required library %s (arch=%q)", requiredLib, arch), "cluster.VerifyLibraries") } } return nil } -// libraryNameFromPath returns the name of a library without the extension, for a +// libraryNameFromPath returns the name of a library without the extension(s), for a // given path. func libraryNameFromPath(path string) string { filename := filepath.Base(path) - return strings.TrimSuffix(filename, filepath.Ext(filename)) + // N.B. filename may contain multiple extensions, e.g. "libgeos.linux-amd64.fips.so". 
+ for ext := filepath.Ext(filename); ext != ""; ext = filepath.Ext(filename) { + filename = strings.TrimSuffix(filename, ext) + } + return filename } func contains(list []string, transformString func(s string) string, str string) bool { @@ -219,50 +306,128 @@ func contains(list []string, transformString func(s string) string, str string) } func initBinariesAndLibraries() { - // If we're running against an existing "local" cluster, force the local flag - // to true in order to get the "local" test configurations. - if clusterName == "local" { - local = true - } - if local { - cloud = spec.Local - } + // TODO(srosenberg): enable metamorphic local clusters; currently, spec.Local means run all tests locally. + // This could be revisited after we have a way to specify which clouds a given test supports, + // see https://github.com/cockroachdb/cockroach/issues/104029. + defaultOsName := "linux" + defaultArch := vm.ArchAMD64 + if cloud == spec.Local { + defaultOsName = runtime.GOOS + if arm64Probability == 1 { + // N.B. if arm64Probability != 1, then we're running a local cluster with both arm64 and amd64. + defaultArch = vm.ArchARM64 + } + if string(defaultArch) != runtime.GOARCH { + fmt.Printf("WARN: local cluster's architecture (%q) differs from default (%q)\n", runtime.GOARCH, defaultArch) + } + } + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOsName, defaultArch) + + // Finds and validates a binary. If the binary 'isRequired', but not found, exit and print the error. 
+ resolveBinary := func(binName string, userSpecified string, arch vm.CPUArch, isRequired bool, checkEA bool) (string, error) { + path := binName + if userSpecified != "" { + path = userSpecified + } + abspath, err := findBinary(path, defaultOsName, arch, checkEA) + if err != nil { + if isRequired { + fmt.Fprintf(os.Stderr, "ERROR: unable to find required binary %q for %q: %v\n", binName, arch, err) + os.Exit(1) + } + return "", err + } + if userSpecified == "" { + // No user-specified path, so return the found absolute path. + return abspath, nil + } + // Bail out if a path other than the user-specified was found. + userPath, err := filepath.Abs(userSpecified) + + if err != nil || userPath != abspath { + err = errors.Wrapf(err, "ERROR: found %q at: %s instead of the user-specified path: %q\n", binName, abspath, userSpecified) - cockroachDefault := "cockroach" - if !local { - cockroachDefault = "cockroach-linux-2.6.32-gnu-amd64" + if isRequired { + fmt.Fprintf(os.Stderr, "%v", err) + os.Exit(1) + } + return "", err + } + return abspath, nil } + // We need to verify we have at least both the cockroach and the workload binaries. var err error - cockroach, err = findBinary(cockroach, cockroachDefault) + + cockroach[defaultArch], _ = resolveBinary("cockroach", cockroachPath, defaultArch, true, false) + workload[defaultArch], _ = resolveBinary("workload", workloadPath, defaultArch, true, false) + cockroachShort[defaultArch], err = resolveBinary("cockroach-short", cockroachShortPath, defaultArch, false, true) if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", defaultArch, err) } - if cockroachShort != "" { - // defValue doesn't matter since cockroachShort is a non-empty string. 
- cockroachShort, err = findBinary(cockroachShort, "" /* defValue */) + if arm64Probability > 0 && defaultArch != vm.ArchARM64 { + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOsName, vm.ArchARM64) + // We need to verify we have all the required binaries for arm64. + cockroach[vm.ArchARM64], _ = resolveBinary("cockroach", cockroachPath, vm.ArchARM64, true, false) + workload[vm.ArchARM64], _ = resolveBinary("workload", workloadPath, vm.ArchARM64, true, false) + cockroachShort[vm.ArchARM64], err = resolveBinary("cockroach-short", cockroachShortPath, vm.ArchARM64, false, true) if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", vm.ArchARM64, err) } } - - workload, err = findBinary(workload, "workload") - if errors.As(err, &errBinaryOrLibraryNotFound{}) { - fmt.Fprintln(os.Stderr, "workload binary not provided, proceeding anyway") - } else if err != nil { - fmt.Fprintf(os.Stderr, "%+v\n", err) - os.Exit(1) + if fipsProbability > 0 && defaultArch != vm.ArchFIPS { + fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOsName, vm.ArchFIPS) + // We need to verify we have all the required binaries for fips. + cockroach[vm.ArchFIPS], _ = resolveBinary("cockroach", cockroachPath, vm.ArchFIPS, true, false) + workload[vm.ArchFIPS], _ = resolveBinary("workload", workloadPath, vm.ArchFIPS, true, false) + cockroachShort[vm.ArchFIPS], err = resolveBinary("cockroach-short", cockroachShortPath, vm.ArchFIPS, false, true) + if err != nil { + fmt.Fprintf(os.Stderr, "WARN: unable to find %q for %q: %s\n", "cockroach-short", vm.ArchFIPS, err) + } } // In v20.2 or higher, optionally expect certain library files to exist. // Since they may not be found in older versions, do not hard error if they are not found. 
- for _, libraryName := range []string{"libgeos", "libgeos_c"} { - if libraryFilePath, err := findLibrary(libraryName); err != nil { - fmt.Fprintf(os.Stderr, "error finding library %s, ignoring: %+v\n", libraryName, err) - } else { - libraryFilePaths = append(libraryFilePaths, libraryFilePath) + for _, arch := range []vm.CPUArch{vm.ArchAMD64, vm.ArchARM64, vm.ArchFIPS} { + if arm64Probability == 0 && defaultArch != vm.ArchARM64 && arch == vm.ArchARM64 { + // arm64 isn't used, skip finding libs for it. + continue + } + if fipsProbability == 0 && arch == vm.ArchFIPS { + // fips isn't used, skip finding libs for it. + continue + } + paths := []string(nil) + + for _, libraryName := range []string{"libgeos", "libgeos_c"} { + if libraryFilePath, err := findLibrary(libraryName, defaultOsName, arch); err != nil { + fmt.Fprintf(os.Stderr, "WARN: unable to find library %s, ignoring: %s\n", libraryName, err) + } else { + paths = append(paths, libraryFilePath) + } + } + libraryFilePaths[arch] = paths + } + // Looks like we have all the binaries we'll need. Let's print them out. + fmt.Printf("\nFound the following binaries:\n") + for arch, path := range cockroach { + if path != "" { + fmt.Printf("\tcockroach %q at: %s\n", arch, path) + } + } + for arch, path := range workload { + if path != "" { + fmt.Printf("\tworkload %q at: %s\n", arch, path) + } + } + for arch, path := range cockroachShort { + if path != "" { + fmt.Printf("\tcockroach-short %q at: %s\n", arch, path) + } + } + for arch, paths := range libraryFilePaths { + if len(paths) > 0 { + fmt.Printf("\tlibraries %q at: %s\n", arch, strings.Join(paths, ", ")) } } } @@ -666,6 +831,8 @@ type clusterImpl struct { // clusterSettings are additional cluster settings set on cluster startup. clusterSettings map[string]string + os string // OS of the cluster + arch vm.CPUArch // CPU architecture of the cluster // destroyState contains state related to the cluster's destruction. 
destroyState destroyState } @@ -749,7 +916,10 @@ type clusterConfig struct { localCluster bool useIOBarrier bool alloc *quotapool.IntAlloc - enableFIPS bool + // Specifies CPU architecture which may require a custom AMI and cockroach binary. + arch vm.CPUArch + // Specifies the OS which may require a custom AMI and cockroach binary. + os string } // clusterFactory is a creator of clusters. @@ -886,7 +1056,7 @@ func (f *clusterFactory) newCluster( providerOptsContainer := vm.CreateProviderOptionsContainer() // The ClusterName is set below in the retry loop to ensure // that each create attempt gets a unique cluster name. - createVMOpts, providerOpts, err := cfg.spec.RoachprodOpts("", cfg.useIOBarrier, cfg.enableFIPS) + createVMOpts, providerOpts, err := cfg.spec.RoachprodOpts("", cfg.useIOBarrier, cfg.arch) if err != nil { // We must release the allocation because cluster creation is not possible at this point. cfg.alloc.Release() @@ -922,6 +1092,8 @@ func (f *clusterFactory) newCluster( spec: cfg.spec, expiration: cfg.spec.Expiration(), r: f.r, + arch: cfg.arch, + os: cfg.os, destroyState: destroyState{ owned: true, alloc: cfg.alloc, @@ -1775,11 +1947,13 @@ func (c *clusterImpl) PutLibraries( if err := c.RunE(ctx, c.All(), "mkdir", "-p", libraryDir); err != nil { return err } - for _, libraryFilePath := range libraryFilePaths { - if !contains(libraries, nil, libraryNameFromPath(libraryFilePath)) { + + for _, libraryFilePath := range libraryFilePaths[c.arch] { + libName := libraryNameFromPath(libraryFilePath) + if !contains(libraries, nil, libName) { continue } - putPath := filepath.Join(libraryDir, filepath.Base(libraryFilePath)) + putPath := filepath.Join(libraryDir, libName) if err := c.PutE( ctx, c.l, @@ -1805,7 +1979,7 @@ func (c *clusterImpl) Stage( c.status("staging binary") defer c.status("") return errors.Wrap(roachprod.Stage(ctx, l, c.MakeNodes(opts...), - "" /* stageOS */, "" /* stageArch */, dir, application, versionOrSHA), "cluster.Stage") + c.os, 
string(c.arch), dir, application, versionOrSHA), "cluster.Stage") } // Get gets files from remote hosts. @@ -2523,6 +2697,10 @@ func (c *clusterImpl) IsSecure() bool { return c.localCertsDir != "" } +func (c *clusterImpl) Architecture() vm.CPUArch { + return c.arch +} + // Extend extends the cluster's expiration by d. func (c *clusterImpl) Extend(ctx context.Context, d time.Duration, l *logger.Logger) error { if ctx.Err() != nil { diff --git a/pkg/cmd/roachtest/cluster/cluster_interface.go b/pkg/cmd/roachtest/cluster/cluster_interface.go index b4e85fe2ab2b..fd2547f70bf0 100644 --- a/pkg/cmd/roachtest/cluster/cluster_interface.go +++ b/pkg/cmd/roachtest/cluster/cluster_interface.go @@ -108,7 +108,10 @@ type Cluster interface { Spec() spec.ClusterSpec Name() string IsLocal() bool + // IsSecure returns true iff the cluster uses TLS. IsSecure() bool + // Returns CPU architecture of the nodes. + Architecture() vm.CPUArch // Deleting CockroachDB data and logs on nodes. diff --git a/pkg/cmd/roachtest/cluster_test.go b/pkg/cmd/roachtest/cluster_test.go index 654c4ece1d0a..a68bca5448e3 100644 --- a/pkg/cmd/roachtest/cluster_test.go +++ b/pkg/cmd/roachtest/cluster_test.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" test2 "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/util/version" "github.com/cockroachdb/errors" "github.com/stretchr/testify/assert" @@ -207,14 +208,14 @@ func TestVerifyLibraries(t *testing.T) { name: "no match", verifyLibs: []string{"required_c"}, libraryFilePaths: []string{"/some/path/lib.so"}, - expectedError: errors.Wrap(errors.Errorf("missing required library %s", + expectedError: errors.Wrap(errors.Errorf("missing required library %s (arch=\"amd64\")", "required_c"), "cluster.VerifyLibraries"), }, { name: "no match on nil libs", verifyLibs: 
[]string{"required_b"}, libraryFilePaths: nil, - expectedError: errors.Wrap(errors.Errorf("missing required library %s", + expectedError: errors.Wrap(errors.Errorf("missing required library %s (arch=\"amd64\")", "required_b"), "cluster.VerifyLibraries"), }, { @@ -223,17 +224,29 @@ func TestVerifyLibraries(t *testing.T) { libraryFilePaths: []string{"/lib/geos.so"}, expectedError: nil, }, + { + name: "single match, multiple extensions", + verifyLibs: []string{"geos"}, + libraryFilePaths: []string{"/lib/geos.linux-amd.so"}, + expectedError: nil, + }, { name: "multiple matches", verifyLibs: []string{"lib", "ltwo", "geos"}, libraryFilePaths: []string{"ltwo.so", "a/geos.so", "/some/path/to/lib.so"}, expectedError: nil, }, + { + name: "multiple matches, multiple extensions", + verifyLibs: []string{"lib", "ltwo", "geos"}, + libraryFilePaths: []string{"ltwo.linux-arm64.so", "a/geos.linux-amd64.fips.so", "/some/path/to/lib.darwin-arm64.so"}, + expectedError: nil, + }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { - libraryFilePaths = tc.libraryFilePaths - actualError := VerifyLibraries(tc.verifyLibs) + libraryFilePaths = map[vm.CPUArch][]string{vm.ArchAMD64: tc.libraryFilePaths} + actualError := VerifyLibraries(tc.verifyLibs, vm.ArchAMD64) if tc.expectedError == nil { require.NoError(t, actualError) } else { diff --git a/pkg/cmd/roachtest/github.go b/pkg/cmd/roachtest/github.go index f12806dcc1b4..cb1f96d573a8 100644 --- a/pkg/cmd/roachtest/github.go +++ b/pkg/cmd/roachtest/github.go @@ -165,7 +165,10 @@ func (g *githubIssues) createPostRequest( roachtestPrefix("cpu"): fmt.Sprintf("%d", spec.Cluster.CPUs), roachtestPrefix("ssd"): fmt.Sprintf("%d", spec.Cluster.SSDs), } - + // Emit CPU architecture only if it was specified; otherwise, it's captured below, assuming cluster was created. 
+ if spec.Cluster.Arch != "" { + clusterParams[roachtestPrefix("arch")] = string(spec.Cluster.Arch) + } // These params can be probabilistically set, so we pass them here to // show what their actual values are in the posted issue. if g.vmCreateOpts != nil { @@ -175,6 +178,11 @@ func (g *githubIssues) createPostRequest( if g.cluster != nil { clusterParams[roachtestPrefix("encrypted")] = fmt.Sprintf("%v", g.cluster.encAtRest) + if spec.Cluster.Arch == "" { + // N.B. when Arch is specified, it cannot differ from cluster's arch. + // Hence, we only emit when arch was unspecified. + clusterParams[roachtestPrefix("arch")] = string(g.cluster.arch) + } } issueMessage := messagePrefix + message diff --git a/pkg/cmd/roachtest/github_test.go b/pkg/cmd/roachtest/github_test.go index 6d62abb3232d..a891b0204a2f 100644 --- a/pkg/cmd/roachtest/github_test.go +++ b/pkg/cmd/roachtest/github_test.go @@ -106,28 +106,31 @@ func TestCreatePostRequest(t *testing.T) { clusterCreationFailed bool loadTeamsFailed bool localSSD bool + arch vm.CPUArch category issueCategory expectedPost bool expectedReleaseBlocker bool expectedParams map[string]string }{ - {true, false, false, false, otherErr, true, false, + {true, false, false, false, "", otherErr, true, false, prefixAll(map[string]string{ "cloud": "gce", "encrypted": "false", "fs": "ext4", "ssd": "0", "cpu": "4", + "arch": "amd64", "localSSD": "false", }), }, - {true, false, false, true, clusterCreationErr, true, false, + {true, false, false, true, vm.ArchARM64, clusterCreationErr, true, false, prefixAll(map[string]string{ "cloud": "gce", "encrypted": "false", "fs": "ext4", "ssd": "0", "cpu": "4", + "arch": "arm64", "localSSD": "true", }), }, @@ -135,7 +138,7 @@ func TestCreatePostRequest(t *testing.T) { // !nonReleaseBlocker and issue is an SSH flake. 
Also ensure that // in the event of a failed cluster creation, nil `vmOptions` and // `clusterImpl` are not dereferenced - {false, true, false, false, sshErr, true, false, + {false, true, false, false, "", sshErr, true, false, prefixAll(map[string]string{ "cloud": "gce", "ssd": "0", @@ -143,12 +146,12 @@ func TestCreatePostRequest(t *testing.T) { }), }, //Simulate failure loading TEAMS.yaml - {true, false, true, false, otherErr, false, false, nil}, + {true, false, true, false, "", otherErr, false, false, nil}, } reg := makeTestRegistry(spec.GCE, "", "", false) for _, c := range testCases { - clusterSpec := reg.MakeClusterSpec(1) + clusterSpec := reg.MakeClusterSpec(1, spec.Arch(c.arch)) testSpec := &registry.TestSpec{ Name: "github_test", @@ -162,7 +165,7 @@ func TestCreatePostRequest(t *testing.T) { l: nilLogger(), } - testClusterImpl := &clusterImpl{spec: clusterSpec} + testClusterImpl := &clusterImpl{spec: clusterSpec, arch: vm.ArchAMD64} vo := vm.DefaultCreateOpts() vmOpts := &vo diff --git a/pkg/cmd/roachtest/main.go b/pkg/cmd/roachtest/main.go index 88b0e0aed96b..9ca04f7d5df0 100644 --- a/pkg/cmd/roachtest/main.go +++ b/pkg/cmd/roachtest/main.go @@ -19,10 +19,12 @@ import ( "os/signal" "os/user" "path/filepath" + "runtime" "time" "github.com/cockroachdb/cockroach/pkg/build" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/tests" "github.com/cockroachdb/cockroach/pkg/roachprod" "github.com/cockroachdb/cockroach/pkg/roachprod/config" @@ -94,7 +96,6 @@ func main() { var clusterID string var count = 1 var versionsBinaryOverride map[string]string - var enableFIPS bool cobra.EnableCommandSorting = false @@ -119,16 +120,50 @@ func main() { if cmd.Name() == "help" { return nil } - - if clusterName != "" && local { - return fmt.Errorf( - "cannot specify both an existing cluster (%s) and --local.
However, if a local cluster "+ - "already exists, --clusters=local will use it", - clusterName) + local := cmd.Flags().Lookup("local").Value.String() == "true" + if local { + if clusterName != "" { + return fmt.Errorf( + "cannot specify both an existing cluster (%s) and --local. However, if a local cluster "+ + "already exists, --clusters=local will use it", + clusterName) + } + cloud = spec.Local } switch cmd.Name() { case "run", "bench", "store-gen": + if !(0 <= arm64Probability && arm64Probability <= 1) { + return fmt.Errorf("'metamorphic-arm64-probability' must be in [0,1]") + } + if !(0 <= fipsProbability && fipsProbability <= 1) { + return fmt.Errorf("'metamorphic-fips-probability' must be in [0,1]") + } + if arm64Probability == 1 && fipsProbability != 0 { + return fmt.Errorf("'metamorphic-fips-probability' must be 0 when 'metamorphic-arm64-probability' is 1") + } + if fipsProbability == 1 && arm64Probability != 0 { + return fmt.Errorf("'metamorphic-arm64-probability' must be 0 when 'metamorphic-fips-probability' is 1") + } + arm64Opt := cmd.Flags().Lookup("metamorphic-arm64-probability") + if !arm64Opt.Changed && runtime.GOARCH == "arm64" && cloud == spec.Local { + fmt.Printf("Detected 'arm64' in 'local mode', setting 'metamorphic-arm64-probability' to 1; use --metamorphic-arm64-probability to run (emulated) with other binaries\n") + arm64Probability = 1 + } + // Find and validate all required binaries and libraries. initBinariesAndLibraries() + + if arm64Probability > 0 { + fmt.Printf("ARM64 clusters will be provisioned with probability %.2f\n", arm64Probability) + } + amd64Probability := 1 - arm64Probability + if amd64Probability > 0 { + fmt.Printf("AMD64 clusters will be provisioned with probability %.2f\n", amd64Probability) + } + if fipsProbability > 0 { + // N.B. arm64Probability < 1, otherwise fipsProbability == 0, as per above check. + // Hence, amd64Probability > 0 is implied. 
+ fmt.Printf("FIPS clusters will be provisioned with probability %.2f\n", fipsProbability*amd64Probability) + } } return nil }, @@ -140,6 +175,7 @@ func main() { "If fewer than --parallelism names are specified, then the parallelism "+ "is capped to the number of clusters specified. When a cluster does not exist "+ "yet, it is created according to the spec.") + var local bool rootCmd.PersistentFlags().BoolVarP( &local, "local", "l", local, "run tests locally") rootCmd.PersistentFlags().StringVarP( @@ -147,15 +183,25 @@ func main() { "Username to use as a cluster name prefix. "+ "If blank, the current OS user is detected and specified.") rootCmd.PersistentFlags().StringVar( - &cockroach, "cockroach", "", "path to cockroach binary to use") + &cockroachPath, "cockroach", "", "path to cockroach binary to use") rootCmd.PersistentFlags().StringVar( - &cockroachShort, "cockroach-short", "", "path to cockroach-short binary (compiled with crdb_test build tag) to use") + &cockroachShortPath, "cockroach-short", "", "path to cockroach-short binary (compiled with crdb_test build tag) to use") rootCmd.PersistentFlags().StringVar( - &workload, "workload", "", "path to workload binary to use") + &workloadPath, "workload", "", "path to workload binary to use") rootCmd.PersistentFlags().Float64Var( &encryptionProbability, "metamorphic-encryption-probability", defaultEncryptionProbability, "probability that clusters will be created with encryption-at-rest enabled "+ "for tests that support metamorphic encryption (default 1.0)") + rootCmd.PersistentFlags().Float64Var( + &fipsProbability, "metamorphic-fips-probability", defaultFIPSProbability, + "conditional probability that amd64 clusters will be created with FIPS, i.e., P(fips | amd64), "+ + "for tests that support FIPS and whose CPU architecture is 'amd64' (default 0) "+ + "NOTE: amd64 clusters are created with probability 1-P(arm64), where P(arm64) is 'metamorphic-arm64-probability'. 
"+ + "Hence, P(fips | amd64) = P(fips) * (1 - P(arm64))") + rootCmd.PersistentFlags().Float64Var( + &arm64Probability, "metamorphic-arm64-probability", defaultARM64Probability, + "probability that clusters will be created with 'arm64' CPU architecture "+ + "for tests that support 'arm64' (default 0)") rootCmd.AddCommand(&cobra.Command{ Use: `version`, @@ -257,7 +303,6 @@ runner itself. user: username, clusterID: clusterID, versionsBinaryOverride: versionsBinaryOverride, - enableFIPS: enableFIPS, }) }, } @@ -295,7 +340,6 @@ runner itself. user: username, clusterID: clusterID, versionsBinaryOverride: versionsBinaryOverride, - enableFIPS: enableFIPS, }) }, } @@ -348,8 +392,6 @@ runner itself. "is present in the list,"+"the respective binary will be used when a "+ "multi-version test asks for the respective binary, instead of "+ "`roachprod stage `. Example: 20.1.4=cockroach-20.1,20.2.0=cockroach-20.2.") - cmd.Flags().BoolVar( - &enableFIPS, "fips", false, "Run tests in enableFIPS mode") } parseCreateOpts(runCmd.Flags(), &overrideOpts) @@ -401,7 +443,6 @@ type cliCfg struct { user string clusterID string versionsBinaryOverride map[string]string - enableFIPS bool } func runTests(register func(registry.Registry), cfg cliCfg) error { @@ -421,7 +462,7 @@ func runTests(register func(registry.Registry), cfg cliCfg) error { filter := registry.NewTestFilter(cfg.args, cfg.runSkipped) clusterType := roachprodCluster bindTo := "" - if local { + if cloud == spec.Local { clusterType = localCluster // This will suppress the annoying "Allow incoming network connections" popup from @@ -442,7 +483,6 @@ func runTests(register func(registry.Registry), cfg cliCfg) error { cpuQuota: cfg.cpuQuota, debugMode: cfg.debugMode, clusterID: cfg.clusterID, - enableFIPS: cfg.enableFIPS, } if err := runner.runHTTPServer(cfg.httpPort, os.Stdout, bindTo); err != nil { return err diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go 
b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go index 4a34fddc7079..880d9fb43ce7 100644 --- a/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go +++ b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go @@ -258,7 +258,7 @@ func NewTest( t.Fatal(err) } - prng, seed := randutil.NewPseudoRand() + prng, seed := randutil.NewLockedPseudoRand() testLogger.Printf("mixed-version random seed: %d", seed) testCtx, cancel := context.WithCancel(ctx) diff --git a/pkg/cmd/roachtest/slack.go b/pkg/cmd/roachtest/slack.go index 1e653d142f61..40f2505003e0 100644 --- a/pkg/cmd/roachtest/slack.go +++ b/pkg/cmd/roachtest/slack.go @@ -75,8 +75,6 @@ func postSlackReport(pass, fail, skip map[*testImpl]struct{}) { switch { case cloud != "": prefix = strings.ToUpper(cloud) - case local: - prefix = "LOCAL" default: prefix = "GCE" } diff --git a/pkg/cmd/roachtest/spec/cluster_spec.go b/pkg/cmd/roachtest/spec/cluster_spec.go index 856e0d6eb708..cec686692157 100644 --- a/pkg/cmd/roachtest/spec/cluster_spec.go +++ b/pkg/cmd/roachtest/spec/cluster_spec.go @@ -63,7 +63,8 @@ func (m MemPerCPU) String() string { // look like. It becomes part of a clusterConfig when the cluster is created. type ClusterSpec struct { Cloud string - InstanceType string // auto-chosen if left empty + Arch vm.CPUArch // CPU architecture; auto-chosen if left empty + InstanceType string // auto-chosen if left empty NodeCount int // CPUs is the number of CPUs per node. CPUs int @@ -199,7 +200,7 @@ func getAzureOpts(machineType string, zones []string) vm.ProviderOpts { // RoachprodOpts returns the opts to use when calling `roachprod.Create()` // in order to create the cluster described in the spec. 
func (s *ClusterSpec) RoachprodOpts( - clusterName string, useIOBarrier bool, enableFIPS bool, + clusterName string, useIOBarrier bool, arch vm.CPUArch, ) (vm.CreateOpts, vm.ProviderOpts, error) { createVMOpts := vm.DefaultCreateOpts() @@ -232,30 +233,41 @@ func (s *ClusterSpec) RoachprodOpts( } createVMOpts.GeoDistributed = s.Geo - createVMOpts.EnableFIPS = enableFIPS + createVMOpts.Arch = string(arch) machineType := s.InstanceType ssdCount := s.SSDs + if s.CPUs != 0 { // Default to the user-supplied machine type, if any. // Otherwise, pick based on requested CPU count. + var selectedArch vm.CPUArch + if len(machineType) == 0 { // If no machine type was specified, choose one // based on the cloud and CPU count. switch s.Cloud { case AWS: - machineType = AWSMachineType(s.CPUs, s.Mem) + machineType, selectedArch = AWSMachineType(s.CPUs, s.Mem, arch) case GCE: - machineType = GCEMachineType(s.CPUs, s.Mem) + machineType, selectedArch = GCEMachineType(s.CPUs, s.Mem, arch) case Azure: machineType = AzureMachineType(s.CPUs, s.Mem) } } + if selectedArch != "" && selectedArch != arch { + // TODO(srosenberg): we need a better way to monitor the rate of this mismatch, i.e., + // other than grepping cluster creation logs. + fmt.Printf("WARN: requested arch %s for machineType %s, but selected %s\n", arch, machineType, selectedArch) + createVMOpts.Arch = string(selectedArch) + } // Local SSD can only be requested // - if configured to prefer doing so, // - if no particular volume size is requested, and, // - on AWS, if the machine type supports it. - if s.PreferLocalSSD && s.VolumeSize == 0 && (s.Cloud != AWS || awsMachineSupportsSSD(machineType)) { + // - on GCE, if the machine type is not ARM64. + if s.PreferLocalSSD && s.VolumeSize == 0 && (s.Cloud != AWS || awsMachineSupportsSSD(machineType)) && + (s.Cloud != GCE || selectedArch != vm.ArchARM64) { // Ensure SSD count is at least 1 if UseLocalSSD is true. 
if ssdCount == 0 { ssdCount = 1 @@ -288,9 +300,9 @@ func (s *ClusterSpec) RoachprodOpts( } } - if createVMOpts.EnableFIPS && !(s.Cloud == GCE || s.Cloud == AWS) { + if createVMOpts.Arch == string(vm.ArchFIPS) && !(s.Cloud == GCE || s.Cloud == AWS) { return vm.CreateOpts{}, nil, errors.Errorf( - "node creation with enableFIPS enabled not yet supported on %s", s.Cloud, + "FIPS not yet supported on %s", s.Cloud, ) } var providerOpts vm.ProviderOpts diff --git a/pkg/cmd/roachtest/spec/machine_type.go b/pkg/cmd/roachtest/spec/machine_type.go index 98e1a7aba508..084e7f215371 100644 --- a/pkg/cmd/roachtest/spec/machine_type.go +++ b/pkg/cmd/roachtest/spec/machine_type.go @@ -10,16 +10,31 @@ package spec -import "fmt" +import ( + "fmt" + + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" +) // AWSMachineType selects a machine type given the desired number of CPUs and -// memory per CPU ratio. -func AWSMachineType(cpus int, mem MemPerCPU) string { +// memory per CPU ratio. Also returns the architecture of the selected machine type. +func AWSMachineType(cpus int, mem MemPerCPU, arch vm.CPUArch) (string, vm.CPUArch) { // TODO(erikgrinaker): These have significantly less RAM than // their GCE counterparts. Consider harmonizing them. 
family := "c5d" // 2 GB RAM per CPU + selectedArch := vm.ArchAMD64 + if arch == vm.ArchFIPS { + selectedArch = vm.ArchFIPS + } else if arch == vm.ArchARM64 { + family = "c7g" // 2 GB RAM per CPU (graviton3) + selectedArch = vm.ArchARM64 + } + if mem == High { family = "m5d" // 4 GB RAM per CPU + if arch == vm.ArchARM64 { + family = "m7g" // 4 GB RAM per CPU (graviton3) + } } else if mem == Low { panic("low memory per CPU not available for AWS") } @@ -36,30 +51,46 @@ func AWSMachineType(cpus int, mem MemPerCPU) string { size = "4xlarge" case cpus <= 36: size = "9xlarge" + if family == "c7g" || family == "m7g" { + size = "8xlarge" + } case cpus <= 72: size = "18xlarge" + if family == "c7g" || family == "m7g" { + size = "16xlarge" + } case cpus <= 96: size = "24xlarge" default: panic(fmt.Sprintf("no aws machine type with %d cpus", cpus)) } - // There is no c5d.24xlarge. + // There is no m7g.24xlarge, fall back to m5d.24xlarge. + if family == "m7g" && size == "24xlarge" { + family = "m5d" + selectedArch = vm.ArchAMD64 + } + + // There is no c5d.24xlarge, fall back to m5d.24xlarge. if family == "c5d" && size == "24xlarge" { family = "m5d" } - return fmt.Sprintf("%s.%s", family, size) + return fmt.Sprintf("%s.%s", family, size), selectedArch } // GCEMachineType selects a machine type given the desired number of CPUs and -// memory per CPU ratio. -func GCEMachineType(cpus int, mem MemPerCPU) string { +// memory per CPU ratio. Also returns the architecture of the selected machine type. +func GCEMachineType(cpus int, mem MemPerCPU, arch vm.CPUArch) (string, vm.CPUArch) { // TODO(peter): This is awkward: at or below 16 cpus, use n1-standard so that // the machines have a decent amount of RAM. We could use custom machine // configurations, but the rules for the amount of RAM per CPU need to be // determined (you can't request any arbitrary amount of RAM). 
series := "n1" + selectedArch := vm.ArchAMD64 + if arch == vm.ArchFIPS { + selectedArch = vm.ArchFIPS + } var kind string switch mem { case Auto: @@ -75,7 +106,12 @@ func GCEMachineType(cpus int, mem MemPerCPU) string { case Low: kind = "highcpu" // 0.9 GB RAM per CPU } - return fmt.Sprintf("%s-%s-%d", series, kind, cpus) + if arch == vm.ArchARM64 && mem == Auto && cpus <= 48 { + series = "t2a" + kind = "standard" + selectedArch = vm.ArchARM64 + } + return fmt.Sprintf("%s-%s-%d", series, kind, cpus), selectedArch } // AzureMachineType selects a machine type given the desired number of CPUs and diff --git a/pkg/cmd/roachtest/spec/option.go b/pkg/cmd/roachtest/spec/option.go index ff59b5bb39ee..146a2b43d503 100644 --- a/pkg/cmd/roachtest/spec/option.go +++ b/pkg/cmd/roachtest/spec/option.go @@ -10,7 +10,11 @@ package spec -import "time" +import ( + "time" + + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" +) // Option is the interface satisfied by options to MakeClusterSpec. type Option interface { @@ -28,6 +32,17 @@ func Cloud(s string) Option { return cloudOption(s) } +type archOption string + +func (o archOption) apply(spec *ClusterSpec) { + spec.Arch = vm.CPUArch(o) +} + +// Request specific CPU architecture. +func Arch(arch vm.CPUArch) Option { + return archOption(arch) +} + type nodeCPUOption int func (o nodeCPUOption) apply(spec *ClusterSpec) { diff --git a/pkg/cmd/roachtest/test_impl.go b/pkg/cmd/roachtest/test_impl.go index 63cab78b582e..adcb5f2e3d9f 100644 --- a/pkg/cmd/roachtest/test_impl.go +++ b/pkg/cmd/roachtest/test_impl.go @@ -123,6 +123,7 @@ func (t *testImpl) BuildVersion() *version.Version { return t.buildVersion } +// Cockroach returns the path to the cockroach binary. 
func (t *testImpl) Cockroach() string { return t.cockroach } diff --git a/pkg/cmd/roachtest/test_registry_test.go b/pkg/cmd/roachtest/test_registry_test.go index 21002046bfd0..0d05c8c933f0 100644 --- a/pkg/cmd/roachtest/test_registry_test.go +++ b/pkg/cmd/roachtest/test_registry_test.go @@ -15,6 +15,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/require" @@ -42,6 +43,12 @@ func TestMakeTestRegistry(t *testing.T) { require.Equal(t, "foo", s.InstanceType) require.EqualValues(t, 4, s.CPUs) require.True(t, s.TerminateOnMigration) + + s = r.MakeClusterSpec(10, spec.CPU(16), spec.Arch(vm.ArchARM64)) + require.EqualValues(t, 10, s.NodeCount) + require.Equal(t, "foo", s.InstanceType) + require.EqualValues(t, 16, s.CPUs) + require.EqualValues(t, vm.ArchARM64, s.Arch) }) } diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 7cf8ebb54ee5..eeea467f4c1e 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -65,6 +65,8 @@ var ( // prometheusScrapeInterval should be consistent with the scrape interval defined in // https://grafana.testeng.crdb.io/prometheus/config prometheusScrapeInterval = time.Second * 15 + + prng, _ = randutil.NewLockedPseudoRand() ) // testRunner runs tests. @@ -163,8 +165,7 @@ type clustersOpt struct { cpuQuota int // Controls whether the cluster is cleaned up at the end of the test. 
- debugMode debugMode - enableFIPS bool + debugMode debugMode } type debugMode int @@ -395,11 +396,12 @@ func defaultClusterAllocator( allocateCluster := func( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, ) (*clusterImpl, *vm.CreateOpts, error) { - wStatus.SetStatus("creating cluster") + wStatus.SetStatus(fmt.Sprintf("creating cluster (arch=%q)", arch)) defer wStatus.SetStatus("") existingClusterName := clustersOpt.clusterName @@ -416,6 +418,9 @@ func defaultClusterAllocator( skipStop: r.config.skipClusterStopOnAttach, skipWipe: r.config.skipClusterWipeOnAttach, } + // TODO(srosenberg): we need to think about validation here. Attaching to an incompatible cluster, e.g., + // using arm64 AMI with amd64 binary, would result in obscure errors. The test runner ensures compatibility + // during cluster reuse, whereas attachment via CLI (e.g., via roachprod) does not. lopt.l.PrintfCtx(ctx, "Attaching to existing cluster %s for test %s", existingClusterName, t.Name) c, err := attachToExistingCluster(ctx, existingClusterName, clusterL, t.Cluster, opt, r.cr) if err == nil { @@ -426,11 +431,11 @@ func defaultClusterAllocator( } // Fall through to create new cluster with name override. 
lopt.l.PrintfCtx( - ctx, "Creating new cluster with custom name %q for test %s: %s", - clustersOpt.clusterName, t.Name, t.Cluster, + ctx, "Creating new cluster with custom name %q for test %s: %s (arch=%q)", + clustersOpt.clusterName, t.Name, t.Cluster, arch, ) } else { - lopt.l.PrintfCtx(ctx, "Creating new cluster for test %s: %s", t.Name, t.Cluster) + lopt.l.PrintfCtx(ctx, "Creating new cluster for test %s: %s (arch=%q)", t.Name, t.Cluster, arch) } cfg := clusterConfig{ @@ -440,7 +445,7 @@ func defaultClusterAllocator( username: clustersOpt.user, localCluster: clustersOpt.typ == localCluster, alloc: alloc, - enableFIPS: clustersOpt.enableFIPS, + arch: arch, } return clusterFactory.newCluster(ctx, cfg, wStatus.SetStatus, lopt.tee) } @@ -450,6 +455,7 @@ func defaultClusterAllocator( type clusterAllocatorFn func( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, @@ -530,8 +536,6 @@ func (r *testRunner) runWorker( } }() - prng, _ := randutil.NewPseudoRand() - // Loop until there's no more work in the pool, we get interrupted, or an // error occurs. for { @@ -577,7 +581,7 @@ func (r *testRunner) runWorker( // Attempt to reuse existing cluster. if c != nil && testToRun.canReuseCluster { err = func() error { - l.PrintfCtx(ctx, "Using existing cluster: %s. Wiping", c.name) + l.PrintfCtx(ctx, "Using existing cluster: %s (arch=%q). Wiping", c.name, c.arch) if err := c.WipeE(ctx, l); err != nil { return err } @@ -604,10 +608,43 @@ func (r *testRunner) runWorker( // Let's attempt to create a fresh one. testToRun.canReuseCluster = false } + // sanity check + if c.spec.Cloud != spec.Local && c.spec.Arch != "" && c.arch != c.spec.Arch { + return errors.Newf("cluster arch %q does not match specified arch %q on cloud: %q", c.arch, c.spec.Arch, c.spec.Cloud) + } + } + arch := testToRun.spec.Cluster.Arch + // N.B. 
local cluster can mix different CPU architectures via emulation; e.g., mac silicon running x86. + if testToRun.canReuseCluster && c != nil && c.spec.Cloud != spec.Local { + // We're reusing a non-local cluster, so we must use the same arch. + arch = c.arch + } + if arch == "" { + // CPU architecture is unspecified, choose one according to the probability distribution. + arch = vm.ArchAMD64 + if prng.Float64() < arm64Probability { + arch = vm.ArchARM64 + } else if prng.Float64() < fipsProbability { + // N.B. branch is taken with probability (1 - arm64Probability) * fipsProbability, i.e., P(fips) = P(amd64) * P(fips | amd64), where fipsProbability is P(fips | amd64). + // N.B. FIPS is only supported on 'amd64' at this time. + arch = vm.ArchFIPS + } + l.PrintfCtx(ctx, "Using (randomly) chosen arch=%q for %s", arch, testToRun.spec.Name) + } else { + l.PrintfCtx(ctx, "Using (specified) arch=%q for %s", arch, testToRun.spec.Name) + } + // N.B. if canReuseCluster is false, then the previous cluster has been destroyed; new one will be created below. + if testToRun.canReuseCluster && c != nil && c.arch != arch { + // Non-local cluster that's being reused must have the same architecture as was ensured above. + if c.spec.Cloud != spec.Local { + return errors.New("infeasible path: non-local cluster arch mismatch") + } + // Local cluster is now reused to emulate a different CPU architecture. + c.arch = arch } // Verify that required native libraries are available. - if err = VerifyLibraries(testToRun.spec.NativeLibs); err != nil { + if err = VerifyLibraries(testToRun.spec.NativeLibs, arch); err != nil { shout(ctx, l, stdout, "Library verification failed: %s", err) return err } @@ -619,13 +656,14 @@ func (r *testRunner) runWorker( // Create a new cluster if can't reuse or reuse attempt failed. // N.B. non-reusable cluster would have been destroyed above.
wStatus.SetTest(nil /* test */, testToRun) - wStatus.SetStatus("creating cluster") - c, vmCreateOpts, clusterCreateErr = allocateCluster(ctx, testToRun.spec, testToRun.alloc, artifactsRootDir, wStatus) + c, vmCreateOpts, clusterCreateErr = allocateCluster(ctx, testToRun.spec, arch, testToRun.alloc, artifactsRootDir, wStatus) if clusterCreateErr != nil { clusterCreateErr = errors.Mark(clusterCreateErr, errClusterProvisioningFailed) atomic.AddInt32(&r.numClusterErrs, 1) shout(ctx, l, stdout, "Unable to create (or reuse) cluster for test %s due to: %s.", testToRun.spec.Name, clusterCreateErr) + } else { + l.PrintfCtx(ctx, "Created new cluster for test %s: %s (arch=%q)", testToRun.spec.Name, c.Name(), arch) } } // Prepare the test's logger. Always set this up with real files, using a @@ -655,9 +693,9 @@ func (r *testRunner) runWorker( } t := &testImpl{ spec: &testToRun.spec, - cockroach: cockroach, - cockroachShort: cockroachShort, - deprecatedWorkload: workload, + cockroach: cockroach[arch], + cockroachShort: cockroachShort[arch], + deprecatedWorkload: workload[arch], buildVersion: binaryVersion, artifactsDir: artifactsDir, artifactsSpec: artifactsSpec, @@ -666,9 +704,6 @@ func (r *testRunner) runWorker( skipInit: topt.skipInit, debug: debugMode.IsDebug(), } - // Now run the test. - l.PrintfCtx(ctx, "starting test: %s:%d", testToRun.spec.Name, testToRun.runNum) - github := newGithubIssues(r.config.disableIssue, c, vmCreateOpts) if clusterCreateErr != nil { @@ -683,6 +718,9 @@ func (r *testRunner) runWorker( shout(ctx, l, stdout, "failed to post issue: %s", err) } } else { + // Now run the test. 
+ l.PrintfCtx(ctx, "Starting test: %s:%d on cluster=%s (arch=%q)", testToRun.spec.Name, testToRun.runNum, c.Name(), arch) + c.setTest(t) if c.spec.NodeCount > 0 { // skip during tests err = c.PutDefaultCockroach(ctx, l, t.Cockroach()) diff --git a/pkg/cmd/roachtest/test_test.go b/pkg/cmd/roachtest/test_test.go index 04bd1c882b6d..52b4514300a6 100644 --- a/pkg/cmd/roachtest/test_test.go +++ b/pkg/cmd/roachtest/test_test.go @@ -100,6 +100,7 @@ func nilLogger() *logger.Logger { func alwaysFailingClusterAllocator( ctx context.Context, t registry.TestSpec, + arch vm.CPUArch, alloc *quotapool.IntAlloc, artifactsDir string, wStatus *workerStatus, diff --git a/pkg/cmd/roachtest/tests/autoupgrade.go b/pkg/cmd/roachtest/tests/autoupgrade.go index c7bd5583da33..7941b8687df0 100644 --- a/pkg/cmd/roachtest/tests/autoupgrade.go +++ b/pkg/cmd/roachtest/tests/autoupgrade.go @@ -13,7 +13,6 @@ package tests import ( "context" "fmt" - "runtime" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" @@ -257,9 +256,6 @@ func registerAutoUpgrade(r registry.Registry) { Owner: registry.OwnerTestEng, Cluster: r.MakeClusterSpec(5), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } pred, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/cdc.go b/pkg/cmd/roachtest/tests/cdc.go index 274c36ddccba..d16c5e646b56 100644 --- a/pkg/cmd/roachtest/tests/cdc.go +++ b/pkg/cmd/roachtest/tests/cdc.go @@ -25,7 +25,6 @@ import ( "net/url" "path/filepath" "regexp" - "runtime" "sort" "strconv" "strings" @@ -48,6 +47,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils/jobutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/protoutil" @@ -583,9 +583,6 @@ type latencyTargets struct { } func runCDCBank(ctx context.Context, t test.Test, c cluster.Cluster) { - if runtime.GOARCH == "arm64" { - t.Skip("Skipping cdc/bank under ARM64.") - } // Make the logs dir on every node to work around the `roachprod get logs` // spam. c.Run(ctx, c.All(), `mkdir -p logs`) @@ -1317,9 +1314,10 @@ func registerCDC(r registry.Registry) { }, }) r.Add(registry.TestSpec{ - Name: "cdc/bank", - Owner: `cdc`, - Cluster: r.MakeClusterSpec(4), + Name: "cdc/bank", + Owner: `cdc`, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. 
+ Cluster: r.MakeClusterSpec(4, spec.Arch(vm.ArchAMD64)), Leases: registry.MetamorphicLeases, RequiresLicense: true, Timeout: 30 * time.Minute, diff --git a/pkg/cmd/roachtest/tests/cluster_to_cluster.go b/pkg/cmd/roachtest/tests/cluster_to_cluster.go index fa506b0c08e1..6c7e19e7da84 100644 --- a/pkg/cmd/roachtest/tests/cluster_to_cluster.go +++ b/pkg/cmd/roachtest/tests/cluster_to_cluster.go @@ -497,8 +497,8 @@ func (rd *replicationDriver) preStreamingWorkload(ctx context.Context) { rd.t.Status("populating source cluster before replication") initStart := timeutil.Now() rd.c.Run(ctx, rd.setup.workloadNode, initCmd) - rd.t.L().Printf("src cluster workload initialization took %s minutes", - timeutil.Since(initStart).Minutes()) + rd.t.L().Printf("src cluster workload initialization took %s", + timeutil.Since(initStart)) } } diff --git a/pkg/cmd/roachtest/tests/decommission.go b/pkg/cmd/roachtest/tests/decommission.go index 3e612984dc59..58fc0eeee38b 100644 --- a/pkg/cmd/roachtest/tests/decommission.go +++ b/pkg/cmd/roachtest/tests/decommission.go @@ -17,7 +17,6 @@ import ( "math/rand" "reflect" "regexp" - "runtime" "strconv" "strings" "time" @@ -102,9 +101,6 @@ func registerDecommission(r registry.Registry) { Owner: registry.OwnerKV, Cluster: r.MakeClusterSpec(numNodes), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } runDecommissionMixedVersions(ctx, t, c, *t.BuildVersion()) }, }) diff --git a/pkg/cmd/roachtest/tests/follower_reads.go b/pkg/cmd/roachtest/tests/follower_reads.go index 6ed247300866..c3393362fce3 100644 --- a/pkg/cmd/roachtest/tests/follower_reads.go +++ b/pkg/cmd/roachtest/tests/follower_reads.go @@ -19,7 +19,6 @@ import ( "net/http" "reflect" "regexp" - "runtime" "strconv" "strings" "time" @@ -103,9 +102,6 @@ func registerFollowerReads(r registry.Registry) { spec.CPU(2), ), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } runFollowerReadsMixedVersionSingleRegionTest(ctx, t, c, *t.BuildVersion()) }, }) diff --git a/pkg/cmd/roachtest/tests/import.go b/pkg/cmd/roachtest/tests/import.go index 8b07bdbb2509..64c5ba102e33 100644 --- a/pkg/cmd/roachtest/tests/import.go +++ b/pkg/cmd/roachtest/tests/import.go @@ -15,7 +15,6 @@ import ( gosql "database/sql" "fmt" "path/filepath" - "runtime" "strings" "time" @@ -357,9 +356,6 @@ func registerImportMixedVersion(r registry.Registry) { // Mixed-version support was added in 21.1. Cluster: r.MakeClusterSpec(4), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } predV, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/mixed_version_backup.go b/pkg/cmd/roachtest/tests/mixed_version_backup.go index bf8e1c3e706c..54ba4fec1bc8 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_backup.go +++ b/pkg/cmd/roachtest/tests/mixed_version_backup.go @@ -19,7 +19,6 @@ import ( "path/filepath" "reflect" "regexp" - "runtime" "sort" "strings" "sync/atomic" @@ -31,6 +30,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/mixedversion" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/jobs" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" @@ -2033,8 +2033,8 @@ func registerBackupMixedVersion(r registry.Registry) { EncryptionSupport: registry.EncryptionMetamorphic, RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") + if c.Spec().Cloud != spec.GCE { + t.Skip("uses gs://cockroachdb-backup-testing, available only in GCE") } roachNodes := c.Range(1, c.Spec().NodeCount-1) diff --git a/pkg/cmd/roachtest/tests/mixed_version_cdc.go b/pkg/cmd/roachtest/tests/mixed_version_cdc.go index bef4ecb237b6..21a7f4e86b9c 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_cdc.go +++ b/pkg/cmd/roachtest/tests/mixed_version_cdc.go @@ -14,7 +14,6 @@ import ( "context" gosql "database/sql" "fmt" - "runtime" "strconv" "strings" "time" @@ -26,6 +25,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/util/randutil" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" @@ -66,15 +66,13 @@ func registerCDCMixedVersions(r registry.Registry) { zones = teamcityAgentZone } r.Add(registry.TestSpec{ - Name: "cdc/mixed-versions", - Owner: registry.OwnerTestEng, - Cluster: r.MakeClusterSpec(5, spec.Zones(zones)), + Name: "cdc/mixed-versions", + Owner: registry.OwnerTestEng, + // N.B. ARM64 is not yet supported, see https://github.com/cockroachdb/cockroach/issues/103888. + Cluster: r.MakeClusterSpec(5, spec.Zones(zones), spec.Arch(vm.ArchAMD64)), Timeout: timeout, RequiresLicense: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } runCDCMixedVersions(ctx, t, c, *t.BuildVersion()) }, }) diff --git a/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go b/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go index 6161bb99d67a..9d791b11d53f 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go +++ b/pkg/cmd/roachtest/tests/mixed_version_decl_schemachange_compat.go @@ -16,11 +16,11 @@ import ( "os" "path/filepath" "regexp" - "runtime" "strings" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/util/version" ) @@ -31,8 +31,8 @@ func registerDeclSchemaChangeCompatMixedVersions(r registry.Registry) { Owner: registry.OwnerSQLFoundations, Cluster: r.MakeClusterSpec(1), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") + if c.Spec().Cloud != spec.GCE { + t.Skip("uses gsutil with gs://cockroach-corpus, available only in GCE") } runDeclSchemaChangeCompatMixedVersions(ctx, t, c, *t.BuildVersion()) }, diff --git a/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go b/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go index 8eeeac3c93df..87343d58f523 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go +++ b/pkg/cmd/roachtest/tests/mixed_version_job_compatibility_in_declarative_schema_changer.go @@ -12,7 +12,6 @@ package tests import ( "context" - "runtime" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" @@ -133,9 +132,6 @@ func registerDeclarativeSchemaChangerJobCompatibilityInMixedVersion(r registry.R Owner: registry.OwnerSQLFoundations, Cluster: r.MakeClusterSpec(4), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } predV, err := version.PredecessorVersion(*t.BuildVersion()) require.NoError(t, err) diff --git a/pkg/cmd/roachtest/tests/mixed_version_jobs.go b/pkg/cmd/roachtest/tests/mixed_version_jobs.go index ed805f4aeef9..33019e8b6ce3 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_jobs.go +++ b/pkg/cmd/roachtest/tests/mixed_version_jobs.go @@ -13,7 +13,6 @@ package tests import ( "context" "fmt" - "runtime" "time" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" @@ -333,9 +332,6 @@ func registerJobsMixedVersions(r registry.Registry) { // vice versa in order to detect regressions in the work done for 20.1. 
Cluster: r.MakeClusterSpec(4), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } predV, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go index fb66c4003e60..990b2e18ca3c 100644 --- a/pkg/cmd/roachtest/tests/mixed_version_schemachange.go +++ b/pkg/cmd/roachtest/tests/mixed_version_schemachange.go @@ -13,7 +13,6 @@ package tests import ( "context" "fmt" - "runtime" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" @@ -32,9 +31,6 @@ func registerSchemaChangeMixedVersions(r registry.Registry) { Cluster: r.MakeClusterSpec(4), NativeLibs: registry.LibGEOS, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } maxOps := 100 concurrency := 5 if c.IsLocal() { diff --git a/pkg/cmd/roachtest/tests/rebalance_load.go b/pkg/cmd/roachtest/tests/rebalance_load.go index 154b9abc2e04..3ae35b08e001 100644 --- a/pkg/cmd/roachtest/tests/rebalance_load.go +++ b/pkg/cmd/roachtest/tests/rebalance_load.go @@ -14,7 +14,6 @@ import ( "context" "fmt" "math/rand" - "runtime" "strings" "time" @@ -193,9 +192,6 @@ func registerRebalanceLoad(r registry.Registry) { Cluster: r.MakeClusterSpec(4), // the last node is just used to generate load Leases: registry.MetamorphicLeases, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } if c.IsLocal() { concurrency = 32 fmt.Printf("lowering concurrency to %d in local testing\n", concurrency) diff --git a/pkg/cmd/roachtest/tests/restore.go b/pkg/cmd/roachtest/tests/restore.go index 4b265fe37303..3f87107a60b9 100644 --- a/pkg/cmd/roachtest/tests/restore.go +++ b/pkg/cmd/roachtest/tests/restore.go @@ -32,6 +32,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/ts/tspb" @@ -299,7 +300,7 @@ func registerRestore(r registry.Registry) { hardware: makeHardwareSpecs(hardwareSpecs{ nodes: 9, zones: []string{"us-east-2b", "us-west-2b", "eu-west-1b"}}), // These zones are AWS-specific. - backup: makeBackupSpecs(backupSpecs{}), + backup: makeBackupSpecs(backupSpecs{cloud: spec.AWS}), timeout: 90 * time.Minute, tags: registry.Tags("aws"), }, @@ -489,8 +490,9 @@ func (hw hardwareSpecs) makeClusterSpecs(r registry.Registry, backupCloud string // https://github.com/cockroachdb/cockroach/issues/98783. // // TODO(srosenberg): Remove this workaround when 98783 is addressed. 
- s.InstanceType = spec.AWSMachineType(s.CPUs, s.Mem) + s.InstanceType, _ = spec.AWSMachineType(s.CPUs, s.Mem, vm.ArchAMD64) s.InstanceType = strings.Replace(s.InstanceType, "d.", ".", 1) + s.Arch = vm.ArchAMD64 } return s } diff --git a/pkg/cmd/roachtest/tests/secondary_indexes.go b/pkg/cmd/roachtest/tests/secondary_indexes.go index 888eee17dffa..a26864022723 100644 --- a/pkg/cmd/roachtest/tests/secondary_indexes.go +++ b/pkg/cmd/roachtest/tests/secondary_indexes.go @@ -12,7 +12,6 @@ package tests import ( "context" - "runtime" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" @@ -140,9 +139,6 @@ func registerSecondaryIndexesMultiVersionCluster(r registry.Registry) { Owner: registry.OwnerSQLFoundations, Cluster: r.MakeClusterSpec(3), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } predV, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index 22d09951bc7f..3d122a5e7425 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -17,7 +17,6 @@ import ( "math/rand" "os" "path/filepath" - "runtime" "strings" "time" @@ -29,6 +28,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/roachprod/prometheus" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/util/search" @@ -317,6 +317,7 @@ var tpccSupportedWarehouses = []struct { // TODO(tbg): this number is copied from gce-n4cpu16. The real number should be a // little higher, find out what it is. 
{hardware: "gce-n5cpu16", v: version.MustParse(`v19.1.0-0`), warehouses: 1300}, + {hardware: "aws-n5cpu16", v: version.MustParse(`v19.1.0-0`), warehouses: 2100}, // Ditto. {hardware: "gce-n5cpu16", v: version.MustParse(`v2.1.0-0`), warehouses: 1300}, } @@ -357,9 +358,6 @@ func maxSupportedTPCCWarehouses( func runTPCCMixedHeadroom( ctx context.Context, t test.Test, c cluster.Cluster, cloud string, versionsToUpgrade int, ) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } crdbNodes := c.Range(1, c.Spec().NodeCount-1) workloadNode := c.Node(c.Spec().NodeCount) @@ -525,13 +523,17 @@ func registerTPCC(r registry.Registry) { runTPCCMixedHeadroom(ctx, t, c, cloud, 1) }, }) + + // N.B. Multiple upgrades may require a released version < 22.2.x, which wasn't built for ARM64. + mixedHeadroomMultiUpgradesSpec := r.MakeClusterSpec(5, spec.CPU(16), spec.RandomlyUseZfs(), spec.Arch(vm.ArchAMD64)) + r.Add(registry.TestSpec{ // run the same mixed-headroom test, but going back two versions - Name: "tpcc/mixed-headroom/multiple-upgrades/" + mixedHeadroomSpec.String(), + Name: "tpcc/mixed-headroom/multiple-upgrades/" + mixedHeadroomMultiUpgradesSpec.String(), Timeout: 5 * time.Hour, Owner: registry.OwnerTestEng, Tags: registry.Tags(`default`), - Cluster: mixedHeadroomSpec, + Cluster: mixedHeadroomMultiUpgradesSpec, EncryptionSupport: registry.EncryptionMetamorphic, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { runTPCCMixedHeadroom(ctx, t, c, cloud, 2) diff --git a/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go b/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go index 63b023d1aa0c..90ed4d0941db 100644 --- a/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go +++ b/pkg/cmd/roachtest/tests/validate_system_schema_after_version_upgrade.go @@ -12,7 +12,6 @@ package tests import ( "context" - "runtime" "strings" 
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" @@ -36,9 +35,6 @@ func registerValidateSystemSchemaAfterVersionUpgrade(r registry.Registry) { Owner: registry.OwnerSQLFoundations, Cluster: r.MakeClusterSpec(1), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } predecessorVersion, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/version.go b/pkg/cmd/roachtest/tests/version.go index 940c5f3125f2..8cf4a2c44167 100644 --- a/pkg/cmd/roachtest/tests/version.go +++ b/pkg/cmd/roachtest/tests/version.go @@ -13,7 +13,6 @@ package tests import ( "context" "fmt" - "runtime" "strings" "time" @@ -223,9 +222,6 @@ func registerVersion(r registry.Registry) { Owner: registry.OwnerTestEng, Cluster: r.MakeClusterSpec(n + 1), Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. See https://github.com/cockroachdb/cockroach/issues/89268") - } pred, err := version.PredecessorVersion(*t.BuildVersion()) if err != nil { t.Fatal(err) diff --git a/pkg/cmd/roachtest/tests/versionupgrade.go b/pkg/cmd/roachtest/tests/versionupgrade.go index 1e3e96b7f29b..e1d272a9bc2f 100644 --- a/pkg/cmd/roachtest/tests/versionupgrade.go +++ b/pkg/cmd/roachtest/tests/versionupgrade.go @@ -98,9 +98,6 @@ DROP TABLE splitmerge.t; } func runVersionUpgrade(ctx context.Context, t test.Test, c cluster.Cluster) { - if c.IsLocal() && runtime.GOARCH == "arm64" { - t.Skip("Skip under ARM64. 
See https://github.com/cockroachdb/cockroach/issues/89268") - } c.Put(ctx, t.DeprecatedWorkload(), "./workload", c.All()) mvt := mixedversion.NewTest(ctx, t, t.L(), c, c.All()) mvt.OnStartup("setup schema changer workload", func(ctx context.Context, l *logger.Logger, r *rand.Rand, helper *mixedversion.Helper) error { diff --git a/pkg/roachprod/install/BUILD.bazel b/pkg/roachprod/install/BUILD.bazel index 94b01bf661e8..e33256a62493 100644 --- a/pkg/roachprod/install/BUILD.bazel +++ b/pkg/roachprod/install/BUILD.bazel @@ -57,6 +57,7 @@ go_test( embed = [":install"], deps = [ "//pkg/roachprod/logger", + "//pkg/roachprod/vm", "//pkg/testutils/datapathutils", "//pkg/util/retry", "@com_github_cockroachdb_datadriven//:datadriven", diff --git a/pkg/roachprod/install/staging.go b/pkg/roachprod/install/staging.go index 09c31a6dd2fa..7a4382479fe5 100644 --- a/pkg/roachprod/install/staging.go +++ b/pkg/roachprod/install/staging.go @@ -17,6 +17,7 @@ import ( "path/filepath" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/cockroachdb/errors" ) @@ -98,30 +99,30 @@ var ( ) // ArchInfoForOS returns an ArchInfo for the given OS and Architecture if currently supported. 
-func ArchInfoForOS(os string, arch string) (archInfo, error) { - if arch != "" && arch != "amd64" && arch != "arm64" && arch != "fips" { +func ArchInfoForOS(os string, arch vm.CPUArch) (archInfo, error) { + if arch != "" && arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS { return archInfo{}, errors.Errorf("unsupported architecture %q", arch) } switch os { case "linux": - if arch == "arm64" { + if arch == vm.ArchARM64 { return linux_arm64_ArchInfo, nil } - if arch == "fips" { + if arch == vm.ArchFIPS { return linux_x86_64_fips_ArchInfo, nil } return linux_x86_64_ArchInfo, nil case "darwin": - if arch == "arm64" { + if arch == vm.ArchARM64 { return darwin_arm64_ArchInfo, nil } - if arch == "fips" { + if arch == vm.ArchFIPS { return archInfo{}, errors.Errorf("%q is not supported on %q", arch, os) } return darwin_x86_64_ArchInfo, nil case "windows": - if arch == "fips" || arch == "arm64" { + if arch == vm.ArchFIPS || arch == vm.ArchARM64 { return archInfo{}, errors.Errorf("%q is not supported on %q", arch, os) } return windowsArchInfo, nil @@ -176,7 +177,7 @@ func StageApplication( applicationName string, version string, os string, - arch string, + arch vm.CPUArch, destDir string, ) error { archInfo, err := ArchInfoForOS(os, arch) @@ -226,7 +227,7 @@ func StageApplication( // URLsForApplication returns a slice of URLs that should be // downloaded for the given application. 
func URLsForApplication( - application string, version string, os string, arch string, + application string, version string, os string, arch vm.CPUArch, ) ([]*url.URL, error) { archInfo, err := ArchInfoForOS(os, arch) if err != nil { diff --git a/pkg/roachprod/install/staging_test.go b/pkg/roachprod/install/staging_test.go index 3455df5fff79..977d755bf401 100644 --- a/pkg/roachprod/install/staging_test.go +++ b/pkg/roachprod/install/staging_test.go @@ -13,6 +13,7 @@ package install import ( "testing" + "github.com/cockroachdb/cockroach/pkg/roachprod/vm" "github.com/stretchr/testify/require" ) @@ -322,7 +323,7 @@ func TestURLsForApplication(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got, err := URLsForApplication(tt.args.application, tt.args.version, tt.args.os, tt.args.arch) + got, err := URLsForApplication(tt.args.application, tt.args.version, tt.args.os, vm.CPUArch(tt.args.arch)) if (err != nil) != tt.wantErr { t.Errorf("URLsForApplication() error = %v, wantErr %v", err, tt.wantErr) return diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 9b3353d816bb..cea021f2eec8 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -538,7 +538,7 @@ func Stage( dir = stageDir } - return install.StageApplication(ctx, l, c, applicationName, version, os, arch, dir) + return install.StageApplication(ctx, l, c, applicationName, version, os, vm.CPUArch(arch), dir) } // Reset resets all VMs in a cluster. 
@@ -1413,7 +1413,7 @@ func StageURL( if stageArch != "" { arch = stageArch } - urls, err := install.URLsForApplication(applicationName, version, os, arch) + urls, err := install.URLsForApplication(applicationName, version, os, vm.CPUArch(arch)) if err != nil { return nil, err } diff --git a/pkg/roachprod/vm/aws/aws.go b/pkg/roachprod/vm/aws/aws.go index bd5f1376fbac..b49fa8e3ec49 100644 --- a/pkg/roachprod/vm/aws/aws.go +++ b/pkg/roachprod/vm/aws/aws.go @@ -456,12 +456,13 @@ func (p *Provider) Create( var g errgroup.Group limiter := rate.NewLimiter(rate.Limit(providerOpts.CreateRateLimit), 2 /* buckets */) for i := range names { + index := i capName := names[i] placement := zones[i] res := limiter.Reserve() g.Go(func() error { time.Sleep(res.Delay()) - return p.runInstance(l, capName, placement, opts, providerOpts) + return p.runInstance(l, capName, index, placement, opts, providerOpts) }) } if err := g.Wait(); err != nil { @@ -914,7 +915,12 @@ func (p *Provider) listRegion( // we need to do a bit of work to look up all of the various ids that // we need in order to actually allocate an instance. func (p *Provider) runInstance( - l *logger.Logger, name string, zone string, opts vm.CreateOpts, providerOpts *ProviderOpts, + l *logger.Logger, + name string, + instanceIdx int, + zone string, + opts vm.CreateOpts, + providerOpts *ProviderOpts, ) error { // There exist different flags to control the machine type when ssd is true. 
// This enables sane defaults for either setting but the behavior can be @@ -1005,7 +1011,7 @@ func (p *Provider) runInstance( extraMountOpts = "nobarrier" } } - filename, err := writeStartupScript(name, extraMountOpts, providerOpts.UseMultipleDisks, opts.EnableFIPS) + filename, err := writeStartupScript(name, extraMountOpts, providerOpts.UseMultipleDisks, opts.Arch == string(vm.ArchFIPS)) if err != nil { return errors.Wrapf(err, "could not write AWS startup script to temp file") } @@ -1021,14 +1027,22 @@ func (p *Provider) runInstance( } imageID := withFlagOverride(az.region.AMI_X86_64, &providerOpts.ImageAMI) useArmAMI := strings.Index(machineType, "6g.") == 1 || strings.Index(machineType, "7g.") == 1 + if useArmAMI && (opts.Arch != "" && opts.Arch != string(vm.ArchARM64)) { + return errors.Errorf("machine type %s is arm64, but requested arch is %s", machineType, opts.Arch) + } //TODO(srosenberg): remove this once we have a better way to detect ARM64 machines if useArmAMI { imageID = withFlagOverride(az.region.AMI_ARM64, &providerOpts.ImageAMI) - l.Printf("Using ARM64 AMI: %s for machine type: %s", imageID, machineType) + // N.B. use arbitrary instanceIdx to suppress the same info for every other instance being created. + if instanceIdx == 0 { + l.Printf("Using ARM64 AMI: %s for machine type: %s", imageID, machineType) + } } - if !useArmAMI && opts.EnableFIPS { + if opts.Arch == string(vm.ArchFIPS) { imageID = withFlagOverride(az.region.AMI_FIPS, &providerOpts.ImageAMI) - l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", imageID, machineType) + if instanceIdx == 0 { + l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", imageID, machineType) + } } args := []string{ "ec2", "run-instances", diff --git a/pkg/roachprod/vm/gce/gcloud.go b/pkg/roachprod/vm/gce/gcloud.go index 3eeadacb6060..7e5d5baeac05 100644 --- a/pkg/roachprod/vm/gce/gcloud.go +++ b/pkg/roachprod/vm/gce/gcloud.go @@ -40,6 +40,7 @@ const ( // ProviderName is gce. 
ProviderName = "gce" DefaultImage = "ubuntu-2004-focal-v20210603" + ARM64Image = "ubuntu-2004-focal-arm64-v20230302" FIPSImage = "ubuntu-pro-fips-2004-focal-v20230302" defaultImageProject = "ubuntu-os-cloud" FIPSImageProject = "ubuntu-os-pro-cloud" @@ -884,10 +885,34 @@ func (p *Provider) Create( // Fixed args. image := providerOpts.Image imageProject := defaultImageProject - if opts.EnableFIPS { + useArmAMI := strings.HasPrefix(strings.ToLower(providerOpts.MachineType), "t2a-") + if useArmAMI && (opts.Arch != "" && opts.Arch != string(vm.ArchARM64)) { + return errors.Errorf("machine type %s is arm64, but requested arch is %s", providerOpts.MachineType, opts.Arch) + } + if useArmAMI && opts.SSDOpts.UseLocalSSD { + return errors.New("local SSDs are not supported with T2A instances, use --local-ssd=false") + } + if useArmAMI { + if len(providerOpts.Zones) == 0 { + zones = []string{"us-central1-a"} + } else { + for _, zone := range providerOpts.Zones { + if !strings.HasPrefix(zone, "us-central1-") { + return errors.New("T2A instances are not supported outside of us-central1") + } + } + } + } + //TODO(srosenberg): remove this once we have a better way to detect ARM64 machines + if useArmAMI { + image = ARM64Image + l.Printf("Using ARM64 AMI: %s for machine type: %s", image, providerOpts.MachineType) + } + if opts.Arch == string(vm.ArchFIPS) { // NB: if FIPS is enabled, it overrides the image passed via CLI (--gce-image) image = FIPSImage imageProject = FIPSImageProject + l.Printf("Using FIPS-enabled AMI: %s for machine type: %s", image, providerOpts.MachineType) } args := []string{ "compute", "instances", "create", @@ -958,7 +983,7 @@ func (p *Provider) Create( } // Create GCE startup script file. 
- filename, err := writeStartupScript(extraMountOpts, opts.SSDOpts.FileSystem, providerOpts.UseMultipleDisks, opts.EnableFIPS) + filename, err := writeStartupScript(extraMountOpts, opts.SSDOpts.FileSystem, providerOpts.UseMultipleDisks, opts.Arch == string(vm.ArchFIPS)) if err != nil { return errors.Wrapf(err, "could not write GCE startup script to temp file") } diff --git a/pkg/roachprod/vm/vm.go b/pkg/roachprod/vm/vm.go index bc28a72cc294..ce7fbdc41e2d 100644 --- a/pkg/roachprod/vm/vm.go +++ b/pkg/roachprod/vm/vm.go @@ -38,14 +38,23 @@ const ( // TagUsage indicates where a certain resource is used. "roachtest" is used // as the key for roachtest created resources. TagUsage = "usage" + // TagArch is the CPU architecture tag const. + TagArch = "arch" + + ArchARM64 = CPUArch("arm64") + ArchAMD64 = CPUArch("amd64") + ArchFIPS = CPUArch("fips") ) +type CPUArch string + // GetDefaultLabelMap returns a label map for a common set of labels. func GetDefaultLabelMap(opts CreateOpts) map[string]string { return map[string]string{ TagCluster: opts.ClusterName, TagLifetime: opts.Lifetime.String(), TagRoachprod: "true", + TagArch: opts.Arch, } } @@ -230,7 +239,7 @@ type CreateOpts struct { CustomLabels map[string]string GeoDistributed bool - EnableFIPS bool + Arch string VMProviders []string SSDOpts struct { UseLocalSSD bool @@ -251,7 +260,8 @@ func DefaultCreateOpts() CreateOpts { GeoDistributed: false, VMProviders: []string{}, OsVolumeSize: 10, - CustomLabels: map[string]string{"roachtest": "true"}, + // N.B. When roachprod is used via CLI, this will be overridden by {"roachprod":"true"}. 
+ CustomLabels: map[string]string{"roachtest": "true"}, } defaultCreateOpts.SSDOpts.UseLocalSSD = true defaultCreateOpts.SSDOpts.NoExt4Barrier = true diff --git a/pkg/util/randutil/rand.go b/pkg/util/randutil/rand.go index 2a0755fdb910..ad1543e80fc8 100644 --- a/pkg/util/randutil/rand.go +++ b/pkg/util/randutil/rand.go @@ -98,6 +98,12 @@ func NewPseudoRand() (*rand.Rand, int64) { return rand.New(rand.NewSource(seed)), seed } +// NewLockedPseudoRand is the same as NewPseudoRand, but the returned Rand uses a thread-safe underlying source. +func NewLockedPseudoRand() (*rand.Rand, int64) { + seed := envutil.EnvOrDefaultInt64("COCKROACH_RANDOM_SEED", NewPseudoSeed()) + return rand.New(NewLockedSource(seed)), seed +} + // NewTestRand returns an instance of math/rand.Rand seeded from rng, which is // seeded with the global seed. If the caller is a test with a different // path-qualified name than the previous caller, rng is reseeded from the global