Skip to content

Commit

Permalink
roachtest: metamorphic ARM64 and FIPS clusters
Browse files Browse the repository at this point in the history
Previously, all roachtests used (cloud) machine types
with the AMD64 (cpu) architecture. Recently [1], new
CI infrastructure was added to run a clone of all the
nightly roachtests, configured with FIPS; i.e., same
AMD64 machine types, different AMI and crdb binary,
patched with FIPS-certified openssl native code.

As of this PR, we add the capability to execute any
roachtest in a cluster, configured with either
ARM64, FIPS, or AMD64 (default). This is controlled
via the two CLI args: `metamorphic-arm64-probability`
and `metamorphic-fips-probability`. The former denotes
the probability (over the uniform distribution) of a new
cluster being provisioned using ARM64 VMs. The latter denotes
the probability of a new AMD64 cluster being provisioned
with the FIPS-compliant (kernel) configuration.
In case a test is compatible only with AMD64, it's
effectively excluded from the set; i.e., both
probabilities apply to compatible tests only.

Note, the two probabilities don't have to add up to 1.
E.g., `metamorphic-arm64-probability==0.4`,
`metamorphic-fips-probability==0.2` denotes that ARM64
clusters are chosen ~40% of the time, whereas of the
remaining ~60% AMD64 clusters, FIPS is chosen ~20%
of the time; i.e., ~12% of all clusters will use FIPS.

Note, the values '0' and '1' are absolute. Setting both
to '0' is tantamount to the behavior before this PR.
Setting either to '1' enforces that _all_ clusters
are provisioned with ARM64 or FIPS, respectively.
A test can specify its required architecture, in which
case, it takes precedence over metamorphic settings.

This PR builds on [2], which enabled ARM64 provisioning
for AWS in roachprod. We add ARM64 provisioning for GCE,
i.e., T2A, as well as refactor 'arch' argument to
denote one of: AMD64, ARM64, FIPS, where the latter
isn't formally a CPU architecture; however, it simplifies
provisioning and binary staging.
We also modify roachprod.List to display CPU architecture,
other than AMD64, with the machine type; this should make it
easier to see which clusters are running ARM64 and FIPS
configurations, as we ramp up their testing.

The PR also adds validation to cockroach binaries and libs
to ensure we can execute tests under ARM64 and FIPS.
Furthermore, we add 'Enabled Assertions' header, generated
at build time, to the cockroach binary; the header is used
to validate whether or not the binary has runtime assertions
enabled.

Epic: none
Release note: None

Resolves: #94957
Resolves: #89268
Informs: #94986

[1] #99224
[2] #103243
  • Loading branch information
srosenberg committed Jun 13, 2023
1 parent c2606a5 commit 1fa4fac
Show file tree
Hide file tree
Showing 42 changed files with 792 additions and 392 deletions.
1 change: 1 addition & 0 deletions pkg/build/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ go_library(
"github.com/cockroachdb/cockroach/pkg/build.utcTime": "{BUILD_UTCTIME}",
},
deps = [
"//pkg/util/buildutil",
"//pkg/util/envutil",
"//pkg/util/version",
],
Expand Down
38 changes: 21 additions & 17 deletions pkg/build/info.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"text/tabwriter"
"time"

"github.com/cockroachdb/cockroach/pkg/util/buildutil"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/version"
)
Expand All @@ -35,11 +36,12 @@ var (
cgoTargetTriple string
platform = fmt.Sprintf("%s %s", runtime.GOOS, runtime.GOARCH)
// Distribution is changed by the CCL init-time hook in non-APL builds.
Distribution = "OSS"
typ string // Type of this build: <empty>, "development", or "release"
channel = "unknown"
envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown")
binaryVersion = computeVersion(tag)
Distribution = "OSS"
typ string // Type of this build: <empty>, "development", or "release"
channel = "unknown"
envChannel = envutil.EnvOrDefaultString("COCKROACH_CHANNEL", "unknown")
enabledAssertions = buildutil.CrdbTestBuild
binaryVersion = computeVersion(tag)
)

const (
Expand Down Expand Up @@ -113,7 +115,8 @@ func (b Info) Long() string {
fmt.Fprintf(tw, "Go Version: %s\n", b.GoVersion)
fmt.Fprintf(tw, "C Compiler: %s\n", b.CgoCompiler)
fmt.Fprintf(tw, "Build Commit ID: %s\n", b.Revision)
fmt.Fprintf(tw, "Build Type: %s", b.Type) // No final newline: cobra prints one for us.
fmt.Fprintf(tw, "Build Type: %s\n", b.Type)
fmt.Fprintf(tw, "Enabled Assertions: %t", b.EnabledAssertions) // No final newline: cobra prints one for us.
_ = tw.Flush()
return buf.String()
}
Expand All @@ -139,17 +142,18 @@ func (b Info) Timestamp() (int64, error) {
// GetInfo returns an Info struct populated with the build information.
func GetInfo() Info {
return Info{
GoVersion: runtime.Version(),
Tag: tag,
Time: utcTime,
Revision: rev,
CgoCompiler: cgoCompiler,
CgoTargetTriple: cgoTargetTriple,
Platform: platform,
Distribution: Distribution,
Type: typ,
Channel: channel,
EnvChannel: envChannel,
GoVersion: runtime.Version(),
Tag: tag,
Time: utcTime,
Revision: rev,
CgoCompiler: cgoCompiler,
CgoTargetTriple: cgoTargetTriple,
Platform: platform,
Distribution: Distribution,
Type: typ,
Channel: channel,
EnvChannel: envChannel,
EnabledAssertions: enabledAssertions,
}
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/build/info.proto
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ message Info {
optional string channel = 9 [(gogoproto.nullable) = false];
// env_channel identifies the product channel as overridden by the COCKROACH_CHANNEL environment variable.
optional string env_channel = 11 [(gogoproto.nullable) = false];
// enabled_assertions returns the value of 'CrdbTestBuild' (true iff compiled with 'crdb_test' tag)
optional bool enabled_assertions = 12 [(gogoproto.nullable) = false];

// dependencies exists to allow tests that run against old clusters
// to unmarshal JSON containing this field. The tag is unimportant,
Expand Down
9 changes: 7 additions & 2 deletions pkg/cmd/roachprod/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ var (
extendLifetime time.Duration
wipePreserveCerts bool
grafanaConfig string
grafanaArch string
grafanaurlOpen bool
grafanaDumpDir string
listDetails bool
Expand Down Expand Up @@ -107,8 +108,9 @@ func initFlags() {
vm.AllProviderNames()))
createCmd.Flags().BoolVar(&createVMOpts.GeoDistributed,
"geo", false, "Create geo-distributed cluster")
createCmd.Flags().BoolVar(&createVMOpts.EnableFIPS,
"fips", false, "Enable FIPS mode (uses custom AMI)")
createCmd.Flags().StringVar(&createVMOpts.Arch, "arch", "",
"architecture override for VM [amd64, arm64, fips]; N.B. fips implies amd64 with openssl")

// N.B. We set "usage=roachprod" as the default, custom label for billing tracking.
createCmd.Flags().StringToStringVar(&createVMOpts.CustomLabels,
"label", map[string]string{"usage": "roachprod"},
Expand Down Expand Up @@ -249,6 +251,9 @@ func initFlags() {
grafanaStartCmd.Flags().StringVar(&grafanaConfig,
"grafana-config", "", "URL to grafana json config")

grafanaStartCmd.Flags().StringVar(&grafanaArch, "arch", "",
"binary architecture override [amd64, arm64]")

grafanaURLCmd.Flags().BoolVar(&grafanaurlOpen,
"open", false, "open the grafana dashboard url on the browser")

Expand Down
17 changes: 11 additions & 6 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ hosts file.
c.PrintDetails(roachprodLibraryLogger)
} else {
fmt.Fprintf(tw, "%s\t%s\t%d", c.Name, c.Clouds(), len(c.VMs))

if !c.IsLocal() {
fmt.Fprintf(tw, "\t(%s)", c.LifetimeRemaining().Round(time.Second))
} else {
Expand Down Expand Up @@ -904,10 +905,14 @@ var grafanaStartCmd = &cobra.Command{
Use: `grafana-start <cluster>`,
Short: `spins up a prometheus and grafana instances on the last node in the cluster`,
Long: `spins up a prometheus and grafana instances on the highest numbered node in the cluster
and will scrape from all nodes in the cluster`,
and will scrape from all nodes in the cluster; NOTE: for arm64 clusters, use --arch arm64`,
Args: cobra.ExactArgs(1),
Run: wrap(func(cmd *cobra.Command, args []string) error {
return roachprod.StartGrafana(context.Background(), roachprodLibraryLogger, args[0],
arch := vm.ArchAMD64
if grafanaArch == "arm64" {
arch = vm.ArchARM64
}
return roachprod.StartGrafana(context.Background(), roachprodLibraryLogger, args[0], arch,
grafanaConfig, nil)
}),
}
Expand Down Expand Up @@ -954,14 +959,14 @@ func validateAndConfigure(cmd *cobra.Command, args []string) {

// Validate architecture flag, if set.
if archOpt := cmd.Flags().Lookup("arch"); archOpt != nil && archOpt.Changed {
arch := strings.ToLower(archOpt.Value.String())
arch := vm.CPUArch(strings.ToLower(archOpt.Value.String()))

if arch != "amd64" && arch != "arm64" && arch != "fips" {
if arch != vm.ArchAMD64 && arch != vm.ArchARM64 && arch != vm.ArchFIPS {
printErrAndExit(fmt.Errorf("unsupported architecture %q", arch))
}
if arch != archOpt.Value.String() {
if string(arch) != archOpt.Value.String() {
// Set the canonical value.
_ = cmd.Flags().Set("arch", arch)
_ = cmd.Flags().Set("arch", string(arch))
}
}
}
Expand Down
Loading

0 comments on commit 1fa4fac

Please sign in to comment.