Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

slack-vitess-r15.0.5: vtctld/vtorc: improve reparenting stats (#13723) #143

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion go/stats/timings.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@ func NewTimings(name, help, label string, categories ...string) *Timings {
return t
}

// Reset will clear histograms: used during testing
// Reset will clear histograms and counters: used during testing
func (t *Timings) Reset() {
t.mu.RLock()
t.histograms = make(map[string]*Histogram)
t.totalCount.Set(0)
t.totalTime.Set(0)
t.mu.RUnlock()
}

Expand Down
6 changes: 6 additions & 0 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ func TestSingleKeyspace(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}

// Cases to test:
Expand All @@ -94,6 +95,7 @@ func TestKeyspaceShard(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, shard0.Vttablets[0], true)
utils.CheckReplication(t, clusterInfo, shard0.Vttablets[0], shard0.Vttablets[1:], 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, clusterInfo.ClusterInstance.VTOrcProcesses[0], keyspace.Name, shard0.Name, 1)
}

// Cases to test:
Expand All @@ -116,6 +118,7 @@ func TestVTOrcRepairs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

var replica, otherReplica *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
Expand Down Expand Up @@ -335,6 +338,7 @@ func TestVTOrcWithPrs(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find any replica tablet other than the current primary
var replica *cluster.Vttablet
Expand Down Expand Up @@ -362,7 +366,9 @@ func TestVTOrcWithPrs(t *testing.T) {
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
// Verify that VTOrc didn't run any other recovery
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 0)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixPrimaryRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.FixReplicaRecoveryName, 0)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverPrimaryHasPrimaryRecoveryName, 0)
Expand Down
233 changes: 233 additions & 0 deletions go/test/endtoend/vtorc/primaryfailure/primary_failure_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ limitations under the License.
package primaryfailure

import (
"bufio"
"fmt"
"os"
"path"
"regexp"
"strings"
"testing"
"time"

Expand Down Expand Up @@ -47,6 +53,7 @@ func TestDownPrimary(t *testing.T) {
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
Expand Down Expand Up @@ -85,6 +92,206 @@ func TestDownPrimary(t *testing.T) {
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}

// bring down primary before VTOrc has started, let vtorc repair.
func TestDownPrimaryBeforeVTOrc(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, nil, cluster.VTOrcConfiguration{}, 0, "none")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
curPrimary := shard0.Vttablets[0]

// Promote the first tablet as the primary
err := clusterInfo.ClusterInstance.VtctlclientProcess.InitializeShard(keyspace.Name, shard0.Name, clusterInfo.ClusterInstance.Cell, curPrimary.TabletUID)
require.NoError(t, err)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
// we know we have only two replcia tablets, so the one not the primary must be the other replica
if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
replica = tablet
}
if tablet.Type == "rdonly" {
rdonly = tablet
}
}
assert.NotNil(t, replica, "could not find replica tablet")
assert.NotNil(t, rdonly, "could not find rdonly tablet")

// check that the replication is setup correctly before we failover
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica}, 10*time.Second)

// Make the current primary vttablet unavailable.
_ = curPrimary.VttabletProcess.TearDown()
err = curPrimary.MysqlctlProcess.Stop()
require.NoError(t, err)

// Start a VTOrc instance
utils.StartVTOrcs(t, clusterInfo, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
}, 1)

vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]

defer func() {
// we remove the tablet from our global list
utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
}()

// check that the replica gets promoted
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)

// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)
}

// delete the primary record and let vtorc repair.
func TestDeletedPrimaryTablet(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s"}, cluster.VTOrcConfiguration{}, 1, "none")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
// find primary from topo
curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
// we know we have only two replcia tablets, so the one not the primary must be the other replica
if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
replica = tablet
}
if tablet.Type == "rdonly" {
rdonly = tablet
}
}
assert.NotNil(t, replica, "could not find replica tablet")
assert.NotNil(t, rdonly, "could not find rdonly tablet")

// check that the replication is setup correctly before we failover
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second)

// use vtctlclient to stop replication on the replica
_, err := clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StopReplication", replica.Alias)
require.NoError(t, err)
// insert a write that is not available on the replica.
utils.VerifyWritesSucceed(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly}, 10*time.Second)

// Make the current primary vttablet unavailable and delete its tablet record.
_ = curPrimary.VttabletProcess.TearDown()
err = curPrimary.MysqlctlProcess.Stop()
require.NoError(t, err)
// use vtctlclient to start replication on the replica back
_, err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommandWithOutput("StartReplication", replica.Alias)
require.NoError(t, err)
err = clusterInfo.ClusterInstance.VtctldClientProcess.ExecuteCommand("DeleteTablets", "--allow-primary", curPrimary.Alias)
require.NoError(t, err)

defer func() {
// we remove the tablet from our global list
utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
}()

// check that the replica gets promoted. Also verify that it has all the writes.
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)

// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{rdonly}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
}

// TestDeadPrimaryRecoversImmediately test Vtorc ability to recover immediately if primary is dead.
// Reason is, unlike other recoveries, in DeadPrimary we don't call DiscoverInstance since we know
// that primary is unreachable. This help us save few seconds depending on value of `RemoteOperationTimeout` flag.
func TestDeadPrimaryRecoversImmediately(t *testing.T) {
defer utils.PrintVTOrcLogsOnFailure(t, clusterInfo.ClusterInstance)
defer cluster.PanicHandler(t)
// We specify the --wait-replicas-timeout to a small value because we spawn a cross-cell replica later in the test.
// If that replica is more advanced than the same-cell-replica, then we try to promote the cross-cell replica as an intermediate source.
// If we don't specify a small value of --wait-replicas-timeout, then we would end up waiting for 30 seconds for the dead-primary to respond, failing this test.
utils.SetupVttabletsAndVTOrcs(t, clusterInfo, 2, 1, []string{"--remote_operation_timeout=10s", "--wait-replicas-timeout=5s"}, cluster.VTOrcConfiguration{
PreventCrossDataCenterPrimaryFailover: true,
}, 1, "semi_sync")
keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
shard0 := &keyspace.Shards[0]
// find primary from topo
curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
assert.NotNil(t, curPrimary, "should have elected a primary")
vtOrcProcess := clusterInfo.ClusterInstance.VTOrcProcesses[0]
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.ElectNewPrimaryRecoveryName, 1)
utils.WaitForSuccessfulPRSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// find the replica and rdonly tablets
var replica, rdonly *cluster.Vttablet
for _, tablet := range shard0.Vttablets {
// we know we have only two replcia tablets, so the one not the primary must be the other replica
if tablet.Alias != curPrimary.Alias && tablet.Type == "replica" {
replica = tablet
}
if tablet.Type == "rdonly" {
rdonly = tablet
}
}
assert.NotNil(t, replica, "could not find replica tablet")
assert.NotNil(t, rdonly, "could not find rdonly tablet")

// Start a cross-cell replica
crossCellReplica := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)

// check that the replication is setup correctly before we failover
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{rdonly, replica, crossCellReplica}, 10*time.Second)

// Make the current primary vttablet unavailable.
err := curPrimary.MysqlctlProcess.Stop()
require.NoError(t, err)
defer func() {
// we remove the tablet from our global list
utils.PermanentlyRemoveVttablet(clusterInfo, curPrimary)
}()

// check that the replica gets promoted
utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
utils.WaitForInstancePollSecondsExceededCount(t, vtOrcProcess, "InstancePollSecondsExceeded", 2, false)
// also check that the replication is working correctly after failover
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica}, 10*time.Second)
utils.WaitForSuccessfulRecoveryCount(t, vtOrcProcess, logic.RecoverDeadPrimaryRecoveryName, 1)
utils.WaitForSuccessfulERSCount(t, vtOrcProcess, keyspace.Name, shard0.Name, 1)

// Parse log file and find out how much time it took for DeadPrimary to recover.
logFile := path.Join(vtOrcProcess.LogDir, vtOrcProcess.LogFileName)
// log prefix printed at the end of analysis where we conclude we have DeadPrimary
t1 := extractTimeFromLog(t, logFile, "Proceeding with DeadPrimary recovery")
// log prefix printed at the end of recovery
t2 := extractTimeFromLog(t, logFile, "auditType:RecoverDeadPrimary")
curr := time.Now().Format("2006-01-02")
timeLayout := "2006-01-02 15:04:05.000000"
timeStr1 := fmt.Sprintf("%s %s", curr, t1)
timeStr2 := fmt.Sprintf("%s %s", curr, t2)
time1, err := time.Parse(timeLayout, timeStr1)
if err != nil {
t.Errorf("unable to parse time %s", err.Error())
}
time2, err := time.Parse(timeLayout, timeStr2)
if err != nil {
t.Errorf("unable to parse time %s", err.Error())
}
diff := time2.Sub(time1)
fmt.Printf("The difference between %s and %s is %v seconds.\n", t1, t2, diff.Seconds())
// assert that it takes less than `remote_operation_timeout` to recover from `DeadPrimary`
// use the value provided in `remote_operation_timeout` flag to compare with.
// We are testing against 9.5 seconds to be safe and prevent flakiness.
assert.Less(t, diff.Seconds(), 9.5)
}

// Failover should not be cross data centers, according to the configuration file
Expand Down Expand Up @@ -571,3 +778,29 @@ func TestDownPrimaryPromotionRuleWithLagCrossCenter(t *testing.T) {
// check that rdonly and crossCellReplica are able to replicate from the replica
utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{crossCellReplica, rdonly}, 15*time.Second)
}

func extractTimeFromLog(t *testing.T, logFile string, logStatement string) string {
file, err := os.Open(logFile)
if err != nil {
t.Errorf("fail to extract time from log statement %s", err.Error())
}
defer file.Close()

scanner := bufio.NewScanner(file)

for scanner.Scan() {
line := scanner.Text()
if strings.Contains(line, logStatement) {
// Regular expression pattern for date format
pattern := `\d{2}:\d{2}:\d{2}\.\d{6}`
re := regexp.MustCompile(pattern)
match := re.FindString(line)
return match
}
}

if err := scanner.Err(); err != nil {
t.Errorf("fail to extract time from log statement %s", err.Error())
}
return ""
}
Loading
Loading