114695: builtins: remove crdb_internal.gc_tenant r=rafiss a=rafiss

This internal function was deprecated a while ago and is no longer used; `DestroyTenant` is used instead.

Epic: None
Release note: None

115559: roachtest: port rebalance/by-load/*/mixed-version to new framework r=nvanbenschoten a=kvoli

Port `rebalance/by-load/leases/mixed-version` and
`rebalance/by-load/replicas/mixed-version` to the new mixed-version
framework as described in #110528.

Part of: #110528
Resolves: #115134
Resolves: #115135
Release note: None
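
For context, the shape of the new framework as used by the ported tests is sketched below. This is excerpted (lightly trimmed) from the `rebalance_load.go` diff in this commit; cluster and settings setup is elided, and `rebalanceByLoad` is the helper this PR factors out.

```go
// The mixed-version framework owns the upgrade schedule; the test registers
// its workload as a hook that runs while the cluster is in a mixed-version
// state.
mvt := mixedversion.NewTest(ctx, t, t.L(), c, roachNodes, mixedversion.NeverUseFixtures,
	// Secure clusters don't play well with the test's admin UI requests, so
	// secure mode is disabled for simplicity.
	mixedversion.ClusterSettingOption(install.SecureOption(false)),
	mixedversion.ClusterSettingOption(install.ClusterSettingsOption(settings.ClusterSettings)),
)
mvt.InMixedVersion("rebalance load run",
	func(ctx context.Context, l *logger.Logger, r *rand.Rand, h *mixedversion.Helper) error {
		return rebalanceByLoad(
			ctx, t, c, rebalanceMode, maxDuration, concurrency, appNode, numStores, numNodes)
	})
mvt.Run()
```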


---

`ts_util.go` provides helper functions for querying the cluster
timeseries database over HTTP. These helpers previously used
`httputil.PostJSON` to request timeseries data. This commit switches them
to `httputil.PostProtobuf`, which allows optional fields the server
doesn't support (namely `TenantID`) to be ignored.

Epic: none
Release note: None
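
To illustrate the idea, a minimal sketch of posting a protobuf-encoded request over HTTP follows. This is not the CockroachDB `httputil` implementation: the `postProtobuf` helper, its signature, and the `application/x-protobuf` content type are assumptions made for the example.

```go
package tsutil

import (
	"bytes"
	"context"
	"fmt"
	"io"
	"net/http"

	"google.golang.org/protobuf/proto"
)

// postProtobuf POSTs a protobuf-encoded request and decodes a protobuf
// response. Because the protobuf wire format skips unknown fields, an older
// server simply ignores fields it doesn't recognize (e.g. an optional
// TenantID) rather than rejecting the request.
func postProtobuf(ctx context.Context, client *http.Client, url string, req, resp proto.Message) error {
	body, err := proto.Marshal(req)
	if err != nil {
		return err
	}
	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
	if err != nil {
		return err
	}
	httpReq.Header.Set("Content-Type", "application/x-protobuf")
	httpReq.Header.Set("Accept", "application/x-protobuf")
	httpResp, err := client.Do(httpReq)
	if err != nil {
		return err
	}
	defer httpResp.Body.Close()
	if httpResp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status: %s", httpResp.Status)
	}
	b, err := io.ReadAll(httpResp.Body)
	if err != nil {
		return err
	}
	return proto.Unmarshal(b, resp)
}
```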

115732: workflows: update to run unit tests in GitHub Actions job r=rail a=rickystewart

Epic: CRDB-8308
Release note: None

Co-authored-by: Rafi Shamim <[email protected]>
Co-authored-by: Austen McClernon <[email protected]>
Co-authored-by: Ricky Stewart <[email protected]>
4 people committed Dec 8, 2023
4 parents 099399b + 25acf63 + 55e2d6f + 9452d4b commit 8475571
Showing 12 changed files with 162 additions and 208 deletions.
11 changes: 10 additions & 1 deletion .github/workflows/experimental-github-actions-testing.yml
@@ -7,4 +7,13 @@ jobs:
test_job:
runs-on: [self-hosted, basic_runner_group]
steps:
- run: echo hi
- uses: actions/checkout@v4
with:
# By default, checkout merges the PR into the current master.
# Instead, we want to check out the PR as is.
ref: ${{ github.event.pull_request.head.sha }}
- run: ./build/github/get-engflow-keys.sh
- run: bazel test //pkg:all_tests --config engflowpublic --config crosslinux --jobs 300 --tls_client_certificate=/home/agent/engflow.crt --tls_client_key=/home/agent/engflow.key --remote_download_minimal
- name: clean up
if: always()
run: rm -f /home/agent/engflow.*
6 changes: 6 additions & 0 deletions build/github/get-engflow-keys.sh
@@ -0,0 +1,6 @@
#!/usr/bin/env bash

set -euxo pipefail

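# Fetch the EngFlow client TLS certificate and key from Google Secret Manager;
# the GitHub Actions workflow's bazel invocation passes them via
# --tls_client_certificate and --tls_client_key.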
gcloud secrets versions access 1 --secret=engflow-mesolite-key > /home/agent/engflow.key
gcloud secrets versions access 1 --secret=engflow-mesolite-crt > /home/agent/engflow.crt
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/tests/admission_control_tpcc_overload.go
@@ -124,7 +124,7 @@ func verifyNodeLiveness(
if err := retry.WithMaxAttempts(ctx, retry.Options{
MaxBackoff: 500 * time.Millisecond,
}, 60, func() (err error) {
response, err = getMetrics(adminURLs[0], now.Add(-runDuration), now, []tsQuery{
response, err = getMetrics(ctx, adminURLs[0], now.Add(-runDuration), now, []tsQuery{
{
name: "cr.node.liveness.heartbeatfailures",
queryType: total,
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/tests/disk_stall.go
@@ -148,7 +148,7 @@ func runDiskStalledDetection(
}

stallAt := timeutil.Now()
response := mustGetMetrics(t, adminURL, workloadStartAt, stallAt, []tsQuery{
response := mustGetMetrics(ctx, t, adminURL, workloadStartAt, stallAt, []tsQuery{
{name: "cr.node.sql.query.count", queryType: total},
})
cum := response.Results[0].Datapoints
@@ -200,7 +200,7 @@ func runDiskStalledDetection(

{
now := timeutil.Now()
response := mustGetMetrics(t, adminURL, workloadStartAt, now, []tsQuery{
response := mustGetMetrics(ctx, t, adminURL, workloadStartAt, now, []tsQuery{
{name: "cr.node.sql.query.count", queryType: total},
})
cum := response.Results[0].Datapoints
250 changes: 131 additions & 119 deletions pkg/cmd/roachtest/tests/rebalance_load.go
@@ -20,11 +20,11 @@ import (
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/mixedversion"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/testutils/release"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
@@ -82,134 +82,56 @@ func registerRebalanceLoad(r registry.Registry) {
appNode := c.Node(c.Spec().NodeCount)
numNodes := len(roachNodes)
numStores := numNodes
startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs,
"--vmodule=store_rebalancer=5,allocator=5,allocator_scorer=5,replicate_queue=5")
if c.Spec().SSDs > 1 && !c.Spec().RAID0 {
numStores *= c.Spec().SSDs
startOpts.RoachprodOpts.StoreCount = c.Spec().SSDs
}
// We want each store to end up with approximately storeToRangeFactor
// (factor) leases such that the CPU load is evenly spread, e.g.
// (n * factor) -1 splits = factor * n ranges = factor leases per store
// Note that we only assert on the CPU of each store w.r.t the mean, not
// the lease count.
splits := (numStores * storeToRangeFactor) - 1
startOpts.RoachprodOpts.ExtraArgs = append(startOpts.RoachprodOpts.ExtraArgs,
"--vmodule=store_rebalancer=5,allocator=5,allocator_scorer=5,replicate_queue=5")

settings := install.MakeClusterSettings()
settings.ClusterSettings["kv.allocator.load_based_rebalancing"] = rebalanceMode
settings.ClusterSettings["kv.range_split.by_load_enabled"] = "false"

if mixedVersion {
predecessorVersionStr, err := release.LatestPredecessor(t.BuildVersion())
require.NoError(t, err)
predecessorVersion := clusterupgrade.MustParseVersion(predecessorVersionStr)
settings.Binary = uploadCockroach(ctx, t, c, c.All(), predecessorVersion)
// Upgrade some (or all) of the first N-1 CRDB nodes. We ignore the last
// CRDB node (to leave at least one node on the older version), and the
// app node.
lastNodeToUpgrade := rand.Intn(c.Spec().NodeCount-2) + 1
t.L().Printf("upgrading %d nodes to the current cockroach binary", lastNodeToUpgrade)
nodesToUpgrade := c.Range(1, lastNodeToUpgrade)
c.Start(ctx, t.L(), startOpts, settings, roachNodes)
upgradeNodes(ctx, t, c, nodesToUpgrade, startOpts, clusterupgrade.CurrentVersion())
mvt := mixedversion.NewTest(ctx, t, t.L(), c, roachNodes, mixedversion.NeverUseFixtures,
// The http requests to the admin UI performed by the test don't play
// well with secure clusters. As of the time of writing, they return
// either of the following errors:
// tls: failed to verify certificate: x509: “node” certificate is not standards compliant
// tls: failed to verify certificate: x509: certificate signed by unknown authority
//
// Disable secure mode for simplicity.
mixedversion.ClusterSettingOption(install.SecureOption(false)),
mixedversion.ClusterSettingOption(install.ClusterSettingsOption(settings.ClusterSettings)),
)
mvt.InMixedVersion("rebalance load run",
func(ctx context.Context, l *logger.Logger, r *rand.Rand, h *mixedversion.Helper) error {
return rebalanceByLoad(
ctx, t, c, rebalanceMode, maxDuration, concurrency, appNode, numStores, numNodes)
})
mvt.Run()
} else {
// Enable collecting CPU profiles when the CPU utilization exceeds 90%.
// This helps debug failures which occur as a result of mismatches
// between allocation (QPS/replica CPU) and hardware signals e.g. see
// #111900. The setting names changed between v22.2 and v23.1, we can't
// easily setup CPU profiling in mixed version tests.
//
// TODO(kvoli): Remove this setup once CPU profiling is enabled by default
// on perf roachtests #97699.
settings.ClusterSettings["server.cpu_profile.duration"] = "2s"
settings.ClusterSettings["server.cpu_profile.interval"] = "2"
settings.ClusterSettings["server.cpu_profile.cpu_usage_combined_threshold"] = "90"
c.Start(ctx, t.L(), startOpts, settings, roachNodes)
require.NoError(t, rebalanceByLoad(
ctx, t, c, rebalanceMode, maxDuration,
concurrency, appNode, numStores, numNodes,
))
}

c.Put(ctx, t.DeprecatedWorkload(), "./workload", appNode)
c.Run(ctx, appNode, fmt.Sprintf("./workload init kv --drop --splits=%d {pgurl:1}", splits))

db := c.Conn(ctx, t.L(), 1)
defer db.Close()

require.NoError(t, WaitFor3XReplication(ctx, t, db))
t.Status("disable load based splitting")
require.NoError(t, disableLoadBasedSplitting(ctx, db))
t.Status(fmt.Sprintf("setting rebalance mode to %s", rebalanceMode))
_, err := db.ExecContext(ctx, `SET CLUSTER SETTING kv.allocator.load_based_rebalancing=$1::string`, rebalanceMode)
require.NoError(t, err)
// Enable collecting CPU profiles when the CPU utilization exceeds 90%.
// This helps debug failures which occur as a result of mismatches
// between allocation (QPS/replica CPU) and hardware signals e.g. see
// #111900.
//
// TODO(kvoli): Remove this setup once CPU profiling is enabled by default
// on perf roachtests #97699.
_, err = db.ExecContext(ctx, `SET CLUSTER SETTING server.cpu_profile.duration = '2s'`)
require.NoError(t, err)
_, err = db.ExecContext(ctx, `SET CLUSTER SETTING server.cpu_profile.interval = '2m'`)
require.NoError(t, err)
_, err = db.ExecContext(ctx, `SET CLUSTER SETTING server.cpu_profile.cpu_usage_combined_threshold = 90`)
require.NoError(t, err)

var m *errgroup.Group // see comment in version.go
m, ctx = errgroup.WithContext(ctx)

// Enable us to exit out of workload early when we achieve the desired CPU
// balance. This drastically shortens the duration of the test in the
// common case.
ctx, cancel := context.WithCancel(ctx)

m.Go(func() error {
t.L().Printf("starting load generator\n")
err := c.RunE(ctx, appNode, fmt.Sprintf(
"./workload run kv --read-percent=95 --tolerate-errors --concurrency=%d "+
"--duration=%v {pgurl:1-%d}",
concurrency, maxDuration, len(roachNodes)))
if errors.Is(ctx.Err(), context.Canceled) {
// We got canceled either because CPU balance was achieved or the
// other worker hit an error. In either case, it's not this worker's
// fault.
return nil
}
return err
})

m.Go(func() error {
t.Status("checking for CPU balance")

storeCPUFn, err := makeStoreCPUFn(ctx, c, t, numNodes, numStores)
if err != nil {
return err
}

var reason string
var balancedStartTime time.Time
var prevIsBalanced bool
for tBegin := timeutil.Now(); timeutil.Since(tBegin) <= maxDuration; {
// Wait out the sample period initially to allow the timeseries to
// populate meaningful information for the test to query.
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(statSamplePeriod):
}

now := timeutil.Now()
clusterStoresCPU, err := storeCPUFn(ctx)
if err != nil {
t.L().Printf("unable to get the cluster stores CPU %s\n", err.Error())
continue
}
var curIsBalanced bool
curIsBalanced, reason = isLoadEvenlyDistributed(clusterStoresCPU, meanCPUTolerance)
t.L().Printf("cpu %s", reason)
if !prevIsBalanced && curIsBalanced {
balancedStartTime = now
}
prevIsBalanced = curIsBalanced
if prevIsBalanced && now.Sub(balancedStartTime) > stableDuration {
t.Status("successfully achieved CPU balance; waiting for kv to finish running")
cancel()
return nil
}
}

return errors.Errorf("CPU not evenly balanced after timeout: %s", reason)
})
if err := m.Wait(); err != nil {
t.Fatal(err)
}
}

concurrency := 128

r.Add(
registry.TestSpec{
Name: `rebalance/by-load/leases`,
Expand Down Expand Up @@ -301,6 +223,96 @@ func registerRebalanceLoad(r registry.Registry) {
)
}

func rebalanceByLoad(
ctx context.Context,
t test.Test,
c cluster.Cluster,
rebalanceMode string,
maxDuration time.Duration,
concurrency int,
appNode option.NodeListOption,
numStores, numNodes int,
) error {
// We want each store to end up with approximately storeToRangeFactor
// (factor) leases such that the CPU load is evenly spread, e.g.
// (n * factor) -1 splits = factor * n ranges = factor leases per store
// Note that we only assert on the CPU of each store w.r.t the mean, not
// the lease count.
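// For example, with hypothetical values of 3 stores and storeToRangeFactor = 8
// (the constant is defined elsewhere in the package), splits = 3*8 - 1 = 23,
// which yields 24 ranges and roughly 8 leases per store once load-based
// rebalancing converges.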
splits := (numStores * storeToRangeFactor) - 1
c.Run(ctx, appNode, fmt.Sprintf("./cockroach workload init kv --drop --splits=%d {pgurl:1}", splits))

db := c.Conn(ctx, t.L(), 1)
defer db.Close()

require.NoError(t, WaitFor3XReplication(ctx, t, db))

var m *errgroup.Group
m, ctx = errgroup.WithContext(ctx)

// Enable us to exit out of workload early when we achieve the desired CPU
// balance. This drastically shortens the duration of the test in the
// common case.
ctx, cancel := context.WithCancel(ctx)

m.Go(func() error {
t.L().Printf("starting load generator\n")
err := c.RunE(ctx, appNode, fmt.Sprintf(
"./cockroach workload run kv --read-percent=95 --tolerate-errors --concurrency=%d "+
"--duration=%v {pgurl:1-%d}",
concurrency, maxDuration, numNodes))
if errors.Is(ctx.Err(), context.Canceled) {
// We got canceled either because CPU balance was achieved or the
// other worker hit an error. In either case, it's not this worker's
// fault.
return nil
}
return err
})

m.Go(func() error {
t.Status("checking for CPU balance")

storeCPUFn, err := makeStoreCPUFn(ctx, c, t, numNodes, numStores)
if err != nil {
return err
}

var reason string
var balancedStartTime time.Time
var prevIsBalanced bool
for tBegin := timeutil.Now(); timeutil.Since(tBegin) <= maxDuration; {
// Wait out the sample period initially to allow the timeseries to
// populate meaningful information for the test to query.
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(statSamplePeriod):
}

now := timeutil.Now()
clusterStoresCPU, err := storeCPUFn(ctx)
if err != nil {
t.L().Printf("unable to get the cluster stores CPU %s\n", err.Error())
continue
}
var curIsBalanced bool
curIsBalanced, reason = isLoadEvenlyDistributed(clusterStoresCPU, meanCPUTolerance)
t.L().Printf("cpu %s", reason)
if !prevIsBalanced && curIsBalanced {
balancedStartTime = now
}
prevIsBalanced = curIsBalanced
if prevIsBalanced && now.Sub(balancedStartTime) > stableDuration {
t.Status("successfully achieved CPU balance; waiting for kv to finish running")
cancel()
return nil
}
}
return errors.Errorf("CPU not evenly balanced after timeout: %s", reason)
})
return m.Wait()
}

// makeStoreCPUFn returns a function which can be called to gather the CPU of
// the cluster stores. When there are multiple stores per node, stores on the
// same node will report identical CPU.
@@ -325,7 +337,7 @@ func makeStoreCPUFn(
return func(ctx context.Context) ([]float64, error) {
now := timeutil.Now()
resp, err := getMetricsWithSamplePeriod(
url, startTime, now, statSamplePeriod, tsQueries)
ctx, url, startTime, now, statSamplePeriod, tsQueries)
if err != nil {
return nil, err
}