vtctld/vtorc: improve reparenting stats #13723

21 changes: 20 additions & 1 deletion changelog/18.0/18.0.0/summary.md
@@ -14,17 +14,19 @@
- [Updated to node v18.16.0](#update-node)
- **[Deprecations and Deletions](#deprecations-and-deletions)**
- [Deprecated Flags](#deprecated-flags)
- [Deprecated Stats](#deprecated-stats)
- [Deleted `V3` planner](#deleted-v3)
- [Deleted `k8stopo`](#deleted-k8stopo)
- [Deleted `vtgr`](#deleted-vtgr)
- [Deleted `query_analyzer`](#deleted-query_analyzer)
- [Deprecated VTBackup stat `DurationByPhase`](#deprecated-vtbackup-stat-duration-by-phase)
- [Deprecated VDiff v1](#deprecated-vdiff-v1)
- - **[New stats](#new-stats)**
+ - **[New Stats](#new-stats)**
- [VTGate Vindex unknown parameters](#vtgate-vindex-unknown-parameters)
- [VTBackup stat `Phase`](#vtbackup-stat-phase)
- [VTBackup stat `PhaseStatus`](#vtbackup-stat-phase-status)
- [Backup and restore metrics for AWS S3](#backup-restore-metrics-aws-s3)
- [VTCtld and VTOrc reparenting stats](#vtctld-and-vtorc-reparenting-stats)
- **[VTTablet](#vttablet)**
- [VTTablet: New ResetSequences RPC](#vttablet-new-rpc-reset-sequences)
- **[Docker](#docker)**
@@ -116,6 +118,15 @@ VTGate flag:

- `--schema_change_signal_user` is deprecated and will be removed in `v19.0`

#### <a id="deprecated-stats"/>Deprecated Stats

The following Emergency Reparent Shard stats are deprecated in `v18.0` and will be removed in `v19.0`:
- `ers_counter`
- `ers_success_counter`
- `ers_failure_counter`

These metrics are replaced by [new reparenting stats introduced in `v18.0`](#vtctld-and-vtorc-reparenting-stats).

#### <a id="deleted-v3"/>Deleted `v3` planner

The `Gen4` planner has been the default planner since Vitess 14. The `v3` planner was deprecated in Vitess 15 and has now been removed in this release.
@@ -182,6 +193,14 @@
vtbackup_restore_count{component="BackupStorage",implementation="S3",operation="AWS:Request:Send"} 165
```

#### <a id="vtctld-and-vtorc-reparenting-stats"/>VTCtld and VTOrc reparenting stats

New VTCtld and VTOrc stats were added to measure the frequency of reparents by keyspace and shard:
- `emergency_reparent_counts` - the number of times Emergency Reparent Shard has been run, further subdivided by keyspace, shard, and the result of the operation.
- `planned_reparent_counts` - the number of times Planned Reparent Shard has been run, further subdivided by keyspace, shard, and the result of the operation.

Also, the `reparent_shard_operation_timings` stat was added to provide per-operation timings of reparent operations, as illustrated below.
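
For illustration, a hypothetical `/debug/vars` excerpt is shown below, with `commerce` and `0` as placeholder keyspace and shard values; the map keys join the Keyspace, Shard, and Result label values with dots:

```
"emergency_reparent_counts": {
  "commerce.0.success": 2,
  "commerce.0.failure": 1
},
"planned_reparent_counts": {
  "commerce.0.success": 5
}
```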

### <a id="vttablet"/>VTTablet

#### <a id="vttablet-new-rpc-reset-sequences"/>New ResetSequences RPC
4 changes: 3 additions & 1 deletion go/stats/timings.go
@@ -61,10 +61,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings {
return t
}

- // Reset will clearStats histograms: used during testing
+ // Reset will clear histograms and counters: used during testing
func (t *Timings) Reset() {
t.mu.RLock()
t.histograms = make(map[string]*Histogram)
+ t.totalCount.Store(0)
+ t.totalTime.Store(0)
t.mu.RUnlock()
}
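
For context, `Timings` tracks per-category histograms alongside running totals held in atomics, and before this change `Reset` cleared only the histogram map, so the totals leaked across test cases. A minimal sketch of the idea, not the actual Vitess implementation:

```go
package sketch

import (
	"sync"
	"sync/atomic"
	"time"
)

// histogram is a stand-in for the stats.Histogram used by Vitess.
type histogram struct{}

// timings mirrors the relevant shape of stats.Timings: per-category
// histograms plus running totals kept in atomics.
type timings struct {
	mu         sync.RWMutex
	histograms map[string]*histogram
	totalCount atomic.Int64
	totalTime  atomic.Int64 // nanoseconds
}

// Record updates both a category histogram and the running totals.
func (t *timings) Record(category string, d time.Duration) {
	t.mu.Lock()
	if _, ok := t.histograms[category]; !ok {
		t.histograms[category] = &histogram{}
	}
	t.mu.Unlock()
	t.totalCount.Add(1)
	t.totalTime.Add(int64(d))
}

// Reset must zero the totals as well: clearing only the histogram map (the
// old behavior) left totalCount and totalTime carrying values across resets.
func (t *timings) Reset() {
	t.mu.Lock()
	t.histograms = make(map[string]*histogram)
	t.mu.Unlock()
	t.totalCount.Store(0)
	t.totalTime.Store(0)
}
```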

6 changes: 6 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
@@ -76,6 +76,7 @@ func TestSingleKeyspace(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}

// Cases to test:
@@ -94,6 +95,7 @@ func TestKeyspaceShard(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}

// Cases to test:
@@ -116,6 +118,7 @@ func TestVTOrcRepairs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

var replica, otherReplica *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
@@ -344,6 +347,7 @@ func TestVTOrcWithPrs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find any replica tablet other than the current primary
var replica *cluster.Vttablet
@@ -371,7 +375,9 @@ func TestVTOrcWithPrs(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
// Verify that VTOrc didn't run any other recovery
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0)
@@ -53,6 +53,7 @@ func TestDownPrimary(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
@@ -99,6 +100,7 @@ func TestDownPrimary(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}

// bring down primary before VTOrc has started, let vtorc repair.
@@ -154,6 +156,7 @@ func TestDownPrimaryBeforeVTOrc(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}

// delete the primary record and let vtorc repair.
@@ -168,6 +171,7 @@ func TestDeletedPrimaryTablet(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
@@ -239,6 +243,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
@@ -275,6 +280,7 @@ func TestDeadPrimaryRecoversImmediately(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// Parse log file and find out how much time it took for DeadPrimary to recover.
logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName)
48 changes: 46 additions & 2 deletions go/test/endtoend/vtorc/utils/utils.go
@@ -20,9 +20,11 @@ import (
"context"
"encoding/json"
"fmt"
"math"
"os"
"os/exec"
"path"
"reflect"
"strconv"
"strings"
"testing"
@@ -957,15 +959,57 @@ func WaitForSuccessfulRecoveryCount(t *testing.T, vtorcInstance *cluster.VTOrcPr
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
- successCount := successfulRecoveriesMap[recoveryName]
+ successCount := int(math.Round(reflect.ValueOf(successfulRecoveriesMap[recoveryName]).Float()))
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
successfulRecoveriesMap := vars["SuccessfulRecoveries"].(map[string]interface{})
- successCount := successfulRecoveriesMap[recoveryName]
+ successCount := int(math.Round(reflect.ValueOf(successfulRecoveriesMap[recoveryName]).Float()))
> **Reviewer (Member):** Apparently the metric produces floats and not ints, so when we compared `successCount == countExpected` it was always false; only the `assert.EqualValues(t, countExpected, successCount)` was passing. This meant that we always waited for the full 15 seconds whenever we entered this function! I have fixed this too as part of this PR.

assert.EqualValues(t, countExpected, successCount)
}
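
The `math.Round(reflect.ValueOf(...).Float())` conversion in these helpers exists because `GetVars` decodes a `/debug/vars` JSON payload, and Go's `encoding/json` represents every untyped JSON number as a `float64`. Comparing that `interface{}` value directly against an `int` is always false, which is the bug described in the review comment above. A small self-contained demonstration:

```go
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// Decoding JSON into map[string]interface{} yields float64 for numbers.
	var vars map[string]interface{}
	_ = json.Unmarshal([]byte(`{"SuccessfulRecoveries": {"ElectNewPrimary": 1}}`), &vars)

	recoveries := vars["SuccessfulRecoveries"].(map[string]interface{})
	count := recoveries["ElectNewPrimary"] // dynamic type: float64

	fmt.Println(count == 1)                // false: float64(1) != int(1) as interface values
	fmt.Println(count == 1.0)              // true: the constant defaults to float64
	fmt.Println(int(count.(float64)) == 1) // true: explicit conversion, as in the fixed helper
}
```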

// WaitForSuccessfulPRSCount waits until the given keyspace-shard's count of successful prs runs matches the count expected.
func WaitForSuccessfulPRSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) {
t.Helper()
timeout := 15 * time.Second
startTime := time.Now()
mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard)
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{})
successCount := int(math.Round(reflect.ValueOf(prsCountsMap[mapKey]).Float()))
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
prsCountsMap := vars["planned_reparent_counts"].(map[string]interface{})
successCount := int(math.Round(reflect.ValueOf(prsCountsMap[mapKey]).Float()))
assert.EqualValues(t, countExpected, successCount)
}

// WaitForSuccessfulERSCount waits until the given keyspace-shard's count of successful ers runs matches the count expected.
func WaitForSuccessfulERSCount(t *testing.T, vtorcInstance *cluster.VTOrcProcess, keyspace, shard string, countExpected int) {
t.Helper()
timeout := 15 * time.Second
startTime := time.Now()
mapKey := fmt.Sprintf("%v.%v.success", keyspace, shard)
for time.Since(startTime) < timeout {
vars := vtorcInstance.GetVars()
ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{})
successCount := int(math.Round(reflect.ValueOf(ersCountsMap[mapKey]).Float()))
if successCount == countExpected {
return
}
time.Sleep(time.Second)
}
vars := vtorcInstance.GetVars()
ersCountsMap := vars["emergency_reparent_counts"].(map[string]interface{})
successCount := int(math.Round(reflect.ValueOf(ersCountsMap[mapKey]).Float()))
assert.EqualValues(t, countExpected, successCount)
}

24 changes: 18 additions & 6 deletions go/vt/vtctl/reparentutil/emergency_reparenter.go
@@ -69,9 +69,14 @@ type EmergencyReparentOptions struct {

// counters for Emergency Reparent Shard
var (
- ersCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
- ersSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
- ersFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")
+ // TODO(timvaillancourt): remove legacyERS* gauges in v19+.
+ legacyERSCounter = stats.NewGauge("ers_counter", "Number of times Emergency Reparent Shard has been run")
+ legacyERSSuccessCounter = stats.NewGauge("ers_success_counter", "Number of times Emergency Reparent Shard has succeeded")
+ legacyERSFailureCounter = stats.NewGauge("ers_failure_counter", "Number of times Emergency Reparent Shard has failed")
+
+ ersCounter = stats.NewCountersWithMultiLabels("emergency_reparent_counts", "Number of times Emergency Reparent Shard has been run",
+ []string{"Keyspace", "Shard", "Result"},
+ )
)
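
For reference, `stats.NewCountersWithMultiLabels` (used for `ersCounter` above) exports counts keyed by the dot-joined label values, which is why the tests below expect entries like `testkeyspace.-.success`. A brief sketch of the usage pattern, with hypothetical stat names:

```go
package sketch

import "vitess.io/vitess/go/stats"

var exampleCounts = stats.NewCountersWithMultiLabels(
	"example_reparent_counts", // hypothetical stat name
	"example help text",
	[]string{"Keyspace", "Shard", "Result"},
)

func recordExample() {
	exampleCounts.Add([]string{"ks1", "-80", "success"}, 1)
	exampleCounts.Add([]string{"ks1", "-80", "failure"}, 1)

	// exampleCounts.Counts() now reports:
	//   map[string]int64{"ks1.-80.success": 1, "ks1.-80.failure": 1}
	_ = exampleCounts.Counts()
}
```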

// NewEmergencyReparenter returns a new EmergencyReparenter object, ready to
@@ -99,26 +104,33 @@ func NewEmergencyReparenter(ts *topo.Server, tmc tmclient.TabletManagerClient, l
// keyspace and shard.
func (erp *EmergencyReparenter) ReparentShard(ctx context.Context, keyspace string, shard string, opts EmergencyReparentOptions) (*events.Reparent, error) {
var err error
statsLabels := []string{keyspace, shard}

opts.lockAction = erp.getLockAction(opts.NewPrimaryAlias)
// First step is to lock the shard for the given operation, if not already locked
if err = topo.CheckShardLocked(ctx, keyspace, shard); err != nil {
var unlock func(*error)
ctx, unlock, err = erp.ts.LockShard(ctx, keyspace, shard, opts.lockAction)
if err != nil {
ersCounter.Add(append(statsLabels, failureResult), 1)
return nil, err
}
defer unlock(&err)
}

// dispatch success or failure of ERS
startTime := time.Now()
ev := &events.Reparent{}
defer func() {
reparentShardOpTimings.Add("EmergencyReparentShard", time.Since(startTime))
> **Author (Contributor):** Should we track the timings of all operations or success only? 🤔

> **Reviewer (@deepthi, Sep 5, 2023):** May as well do all of them.

switch err {
case nil:
- ersSuccessCounter.Add(1)
+ legacyERSSuccessCounter.Add(1)
+ ersCounter.Add(append(statsLabels, successResult), 1)
event.DispatchUpdate(ev, "finished EmergencyReparentShard")
default:
- ersFailureCounter.Add(1)
+ legacyERSFailureCounter.Add(1)
+ ersCounter.Add(append(statsLabels, failureResult), 1)
event.DispatchUpdate(ev, "failed EmergencyReparentShard: "+err.Error())
}
}()
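
The deferred closure above is what keeps the new stats accurate on every exit path: it runs after `err` has its final value, so a single `defer` records both the operation timing and the success or failure count. A stripped-down sketch of the shape, with hypothetical names:

```go
package sketch

import (
	"time"

	"vitess.io/vitess/go/stats"
)

// Hypothetical stand-ins for reparentShardOpTimings and the labelled counters.
var (
	opTimings      = stats.NewTimings("sketch_reparent_timings", "help text", "Operation")
	reparentCounts = stats.NewCountersWithMultiLabels("sketch_reparent_counts", "help text",
		[]string{"Keyspace", "Shard", "Result"})
)

func reparentShard(keyspace, shard string) (err error) {
	statsLabels := []string{keyspace, shard}
	startTime := time.Now()
	defer func() {
		// Runs after err has its final value, so timing and the
		// success/failure count are recorded in one place on every path.
		opTimings.Add("EmergencyReparentShard", time.Since(startTime))
		result := "success"
		if err != nil {
			result = "failure"
		}
		reparentCounts.Add(append(statsLabels, result), 1)
	}()

	// ... the actual reparent work goes here, assigning its outcome to err ...
	return err
}
```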
@@ -142,7 +154,7 @@ func (erp *EmergencyReparenter) getLockAction(newPrimaryAlias *topodatapb.Tablet
func (erp *EmergencyReparenter) reparentShardLocked(ctx context.Context, ev *events.Reparent, keyspace, shard string, opts EmergencyReparentOptions) (err error) {
// log the starting of the operation and increment the counter
erp.logger.Infof("will initiate emergency reparent shard in keyspace - %s, shard - %s", keyspace, shard)
- ersCounter.Add(1)
+ legacyERSCounter.Add(1)

var (
stoppedReplicationSnapshot *replicationSnapshot
30 changes: 20 additions & 10 deletions go/vt/vtctl/reparentutil/emergency_reparenter_test.go
@@ -2733,10 +2733,12 @@ func TestEmergencyReparenter_waitForAllRelayLogsToApply(t *testing.T) {
}
}

- func TestEmergencyReparenterCounters(t *testing.T) {
- ersCounter.Set(0)
- ersSuccessCounter.Set(0)
- ersFailureCounter.Set(0)
+ func TestEmergencyReparenterStats(t *testing.T) {
+ ersCounter.ResetAll()
+ legacyERSCounter.Reset()
+ legacyERSSuccessCounter.Reset()
+ legacyERSFailureCounter.Reset()
+ reparentShardOpTimings.Reset()

emergencyReparentOps := EmergencyReparentOptions{}
tmc := &testutil.TabletManagerClient{
@@ -2865,9 +2867,13 @@ func TestEmergencyReparenterCounters(t *testing.T) {
require.NoError(t, err)

// check the counter values
- require.EqualValues(t, 1, ersCounter.Get())
- require.EqualValues(t, 1, ersSuccessCounter.Get())
- require.EqualValues(t, 0, ersFailureCounter.Get())
+ require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1}, ersCounter.Counts())
+ require.EqualValues(t, map[string]int64{"All": 1, "EmergencyReparentShard": 1}, reparentShardOpTimings.Counts())
+
+ // check the legacy counter values
+ require.EqualValues(t, 1, legacyERSCounter.Get())
+ require.EqualValues(t, 1, legacyERSSuccessCounter.Get())
+ require.EqualValues(t, 0, legacyERSFailureCounter.Get())

// set emergencyReparentOps to request a non existent tablet
emergencyReparentOps.NewPrimaryAlias = &topodatapb.TabletAlias{
@@ -2880,9 +2886,13 @@ func TestEmergencyReparenterCounters(t *testing.T) {
require.Error(t, err)

// check the counter values
- require.EqualValues(t, 2, ersCounter.Get())
- require.EqualValues(t, 1, ersSuccessCounter.Get())
- require.EqualValues(t, 1, ersFailureCounter.Get())
+ require.EqualValues(t, map[string]int64{"testkeyspace.-.success": 1, "testkeyspace.-.failure": 1}, ersCounter.Counts())
+ require.EqualValues(t, map[string]int64{"All": 2, "EmergencyReparentShard": 2}, reparentShardOpTimings.Counts())
+
+ // check the legacy counter values
+ require.EqualValues(t, 2, legacyERSCounter.Get())
+ require.EqualValues(t, 1, legacyERSSuccessCounter.Get())
+ require.EqualValues(t, 1, legacyERSFailureCounter.Get())
}

func TestEmergencyReparenter_findMostAdvanced(t *testing.T) {