Skip to content

Commit

Permalink
Merge pull request #9258 from planetscale/prs-test-port
Browse files Browse the repository at this point in the history
Use PRS in VtOrc while doing Graceful Primary Takeover
  • Loading branch information
GuptaManan100 authored Nov 22, 2021
2 parents 15c49ba + 574cfee commit 856aef4
Show file tree
Hide file tree
Showing 12 changed files with 397 additions and 160 deletions.
2 changes: 0 additions & 2 deletions docker/mini/orchestrator-vitess-mini.conf.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,11 @@
"RecoverMasterClusterFilters": [],
"RecoverIntermediateMasterClusterFilters": [],
"OnFailureDetectionProcesses": [],
"PreGracefulTakeoverProcesses": [],
"PreFailoverProcesses": [],
"PostFailoverProcesses": [],
"PostUnsuccessfulFailoverProcesses": [],
"PostMasterFailoverProcesses": [],
"PostIntermediateMasterFailoverProcesses": [],
"PostGracefulTakeoverProcesses": [],
"CoMasterRecoveryMustPromoteOtherCoMaster": true,
"DetachLostReplicasAfterMasterFailover": true,
"ApplyMySQLPromotionAfterMasterFailover": true,
Expand Down
5 changes: 0 additions & 5 deletions go/test/endtoend/vtorc/general/vtorc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,6 @@ import (

"vitess.io/vitess/go/test/endtoend/vtorc/utils"

_ "vitess.io/vitess/go/vt/topo/consultopo"
_ "vitess.io/vitess/go/vt/topo/etcd2topo"
_ "vitess.io/vitess/go/vt/topo/k8stopo"
_ "vitess.io/vitess/go/vt/topo/zk2topo"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

Expand Down
173 changes: 173 additions & 0 deletions go/test/endtoend/vtorc/gracefultakeover/graceful_takeover_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
/*
Copyright 2021 The Vitess Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gracefultakeover

import (
"fmt"
"testing"
"time"

"github.com/stretchr/testify/assert"

"vitess.io/vitess/go/test/endtoend/cluster"
"vitess.io/vitess/go/test/endtoend/vtorc/utils"
)

// TestGracefulPrimaryTakeover makes an API call to graceful-primary-takeover, naming the
// replica as the promotion target, and checks that the replica becomes the primary.
// It covers the test case graceful-master-takeover from orchestrator.
func TestGracefulPrimaryTakeover(t *testing.T) {
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, "test_config.json")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]

	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	// curPrimary is dereferenced below; assert alone would let the test run on into a nil dereference
	if !assert.NotNil(t, curPrimary, "should have elected a primary") {
		t.FailNow()
	}

	// find the replica tablet
	var replica *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// we know we have only two tablets, so the one not the primary must be the replica
		if tablet.Alias != curPrimary.Alias {
			replica = tablet
		}
	}
	// replica.MySQLPort is read below, so a missing replica must stop the test here
	if !assert.NotNil(t, replica, "could not find replica tablet") {
		t.FailNow()
	}

	// check that the replication is setup correctly before we failover
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 10*time.Second)

	status, _ := utils.MakeAPICallUntilRegistered(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/localhost/%d", curPrimary.MySQLPort, replica.MySQLPort))
	assert.Equal(t, 200, status)

	// check that the replica gets promoted
	utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
	utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{curPrimary}, 10*time.Second)
}

// TestGracefulPrimaryTakeoverNoTarget makes an API call to graceful-primary-takeover without
// specifying the tablet to promote, and checks that the only other tablet becomes the primary.
// It covers the test case graceful-master-takeover-fail-no-target from orchestrator.
// orchestrator used to fail in this case, but for VtOrc, specifying no target makes it choose one on its own
func TestGracefulPrimaryTakeoverNoTarget(t *testing.T) {
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 0, nil, "test_config.json")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]

	// find primary from topo
	curPrimary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	// curPrimary is dereferenced below; assert alone would let the test run on into a nil dereference
	if !assert.NotNil(t, curPrimary, "should have elected a primary") {
		t.FailNow()
	}

	// find the replica tablet
	var replica *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// we know we have only two tablets, so the one not the primary must be the replica
		if tablet.Alias != curPrimary.Alias {
			replica = tablet
		}
	}
	// the replica is handed to the checks below, so a missing replica must stop the test here
	if !assert.NotNil(t, replica, "could not find the replica tablet") {
		t.FailNow()
	}

	// check that the replication is setup correctly before we failover
	utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica}, 10*time.Second)

	// the trailing slash leaves the destination empty so that no promotion target is named
	status, _ := utils.MakeAPICallUntilRegistered(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/", curPrimary.MySQLPort))
	assert.Equal(t, 200, status)

	// check that the replica gets promoted
	utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
	utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{curPrimary}, 10*time.Second)
}

// TestGracefulPrimaryTakeoverAuto makes an API call to graceful-primary-takeover-auto with the
// replica as the target, checks that it gets promoted, and then makes a second call without a
// target and checks that the original primary gets promoted back.
// It covers the test case graceful-master-takeover-auto from orchestrator.
func TestGracefulPrimaryTakeoverAuto(t *testing.T) {
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVtorc(t, clusterInfo, 2, 1, nil, "test_config.json")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]

	// find primary from topo
	primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	// primary is dereferenced below; assert alone would let the test run on into a nil dereference
	if !assert.NotNil(t, primary, "should have elected a primary") {
		t.FailNow()
	}

	// find the replica tablet and the rdonly tablet
	var replica, rdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		// we know we have only two replica tablets, so the one not the primary must be the other replica
		if tablet.Alias != primary.Alias && tablet.Type == "replica" {
			replica = tablet
		}
		if tablet.Type == "rdonly" {
			rdonly = tablet
		}
	}
	// both tablets are used by the checks below, so stop the test if either one is missing
	foundReplica := assert.NotNil(t, replica, "could not find replica tablet")
	foundRdonly := assert.NotNil(t, rdonly, "could not find rdonly tablet")
	if !foundReplica || !foundRdonly {
		t.FailNow()
	}

	// check that the replication is setup correctly before we failover
	utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second)

	status, _ := utils.MakeAPICallUntilRegistered(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover-auto/localhost/%d/localhost/%d", primary.MySQLPort, replica.MySQLPort))
	assert.Equal(t, 200, status)

	// check that the replica gets promoted
	utils.CheckPrimaryTablet(t, clusterInfo, replica, true)
	utils.VerifyWritesSucceed(t, clusterInfo, replica, []*cluster.Vttablet{primary, rdonly}, 10*time.Second)

	// second call names no target; the only other replica-type tablet is the original primary
	status, _ = utils.MakeAPICallUntilRegistered(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover-auto/localhost/%d/", replica.MySQLPort))
	assert.Equal(t, 200, status)

	// check that the primary gets promoted back
	utils.CheckPrimaryTablet(t, clusterInfo, primary, true)
	utils.VerifyWritesSucceed(t, clusterInfo, primary, []*cluster.Vttablet{replica, rdonly}, 10*time.Second)
}

// TestGracefulPrimaryTakeoverFailCrossCell makes an API call to graceful-primary-takeover with a
// cross-cell replica as the target and checks that the call errors out and the primary is unchanged.
// It covers the test case graceful-master-takeover-fail-cross-region from orchestrator.
func TestGracefulPrimaryTakeoverFailCrossCell(t *testing.T) {
	defer cluster.PanicHandler(t)
	utils.SetupVttabletsAndVtorc(t, clusterInfo, 1, 1, nil, "test_config.json")
	keyspace := &clusterInfo.ClusterInstance.Keyspaces[0]
	shard0 := &keyspace.Shards[0]

	// find primary from topo
	primary := utils.ShardPrimaryTablet(t, clusterInfo, keyspace, shard0)
	// primary is dereferenced below; assert alone would let the test run on into a nil dereference
	if !assert.NotNil(t, primary, "should have elected a primary") {
		t.FailNow()
	}

	// find the rdonly tablet
	var rdonly *cluster.Vttablet
	for _, tablet := range shard0.Vttablets {
		if tablet.Type == "rdonly" {
			rdonly = tablet
		}
	}
	// rdonly is handed to the replication checks below, so stop the test if it is missing
	if !assert.NotNil(t, rdonly, "could not find rdonly tablet") {
		t.FailNow()
	}

	crossCellReplica1 := utils.StartVttablet(t, clusterInfo, utils.Cell2, false)
	// newly started tablet does not replicate from anyone yet, we will allow orchestrator to fix this too
	utils.CheckReplication(t, clusterInfo, primary, []*cluster.Vttablet{crossCellReplica1, rdonly}, 25*time.Second)

	status, response := utils.MakeAPICallUntilRegistered(t, fmt.Sprintf("http://localhost:3000/api/graceful-primary-takeover/localhost/%d/localhost/%d", primary.MySQLPort, crossCellReplica1.MySQLPort))
	assert.Equal(t, 500, status)
	assert.Contains(t, response, "GracefulPrimaryTakeover: constraint failure")

	// check that the cross-cell replica doesn't get promoted and the previous primary is still the primary
	utils.CheckPrimaryTablet(t, clusterInfo, primary, true)
	utils.VerifyWritesSucceed(t, clusterInfo, primary, []*cluster.Vttablet{crossCellReplica1, rdonly}, 10*time.Second)
}
77 changes: 77 additions & 0 deletions go/test/endtoend/vtorc/gracefultakeover/main_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
Copyright 2021 The Vitess Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gracefultakeover

import (
"fmt"
"os"
"testing"

"vitess.io/vitess/go/test/endtoend/cluster"
"vitess.io/vitess/go/test/endtoend/vtorc/utils"
)

var clusterInfo *utils.VtOrcClusterInfo

// TestMain creates the cluster and topo shared by every test in this package, runs the
// tests, tears the cluster down and exits with the test exit code (or 1 if setup failed).
func TestMain(m *testing.M) {
	// setup cellInfos before creating the cluster
	cellInfos := []*utils.CellInfo{
		{
			CellName:    utils.Cell1,
			NumReplicas: 6,
			NumRdonly:   2,
			UIDBase:     100,
		},
		{
			CellName:    utils.Cell2,
			NumReplicas: 2,
			NumRdonly:   0,
			UIDBase:     200,
		},
	}

	exitcode, err := func() (int, error) {
		var err error
		clusterInfo, err = utils.CreateClusterAndStartTopo(cellInfos)
		if err != nil {
			return 1, err
		}

		return m.Run(), nil
	}()

	cluster.PanicHandler(nil)

	// clusterInfo may be nil when cluster creation failed (presumably it is on error; not
	// visible from here), so only tear down what exists. Otherwise the nil dereference
	// would hide the setup error reported below.
	if clusterInfo != nil {
		// stop vtorc first otherwise its logs get polluted
		// with instances being unreachable triggering unnecessary operations
		if clusterInfo.ClusterInstance.VtorcProcess != nil {
			_ = clusterInfo.ClusterInstance.VtorcProcess.TearDown()
		}

		for _, cellInfo := range clusterInfo.CellInfos {
			utils.KillTablets(cellInfo.ReplicaTablets)
			utils.KillTablets(cellInfo.RdonlyTablets)
		}
		clusterInfo.ClusterInstance.Keyspaces[0].Shards[0].Vttablets = nil
		clusterInfo.ClusterInstance.Teardown()
	}

	if err != nil {
		fmt.Printf("%v\n", err)
		os.Exit(1)
	}
	os.Exit(exitcode)
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package primaryFailure
package primaryfailure

import (
"fmt"
Expand All @@ -23,11 +23,6 @@ import (

"vitess.io/vitess/go/test/endtoend/vtorc/utils"

_ "vitess.io/vitess/go/vt/topo/consultopo"
_ "vitess.io/vitess/go/vt/topo/etcd2topo"
_ "vitess.io/vitess/go/vt/topo/k8stopo"
_ "vitess.io/vitess/go/vt/topo/zk2topo"

"vitess.io/vitess/go/test/endtoend/cluster"
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

package primaryFailure
package primaryfailure

import (
"testing"
Expand Down
39 changes: 39 additions & 0 deletions go/test/endtoend/vtorc/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ package utils
import (
"context"
"fmt"
"io/ioutil"
"net/http"
"os"
"os/exec"
"path"
Expand All @@ -29,6 +31,12 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

// This imports toposervers to register their implementations of TopoServer.
_ "vitess.io/vitess/go/vt/topo/consultopo"
_ "vitess.io/vitess/go/vt/topo/etcd2topo"
_ "vitess.io/vitess/go/vt/topo/k8stopo"
_ "vitess.io/vitess/go/vt/topo/zk2topo"

"vitess.io/vitess/go/json2"
"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/sqltypes"
Expand Down Expand Up @@ -704,3 +712,34 @@ func CheckSourcePort(t *testing.T, replica *cluster.Vttablet, source *cluster.Vt
time.Sleep(300 * time.Millisecond)
}
}

// MakeAPICall is used to make an API call given the url. It returns the status and the body of the response received.
// The test is failed immediately if the request cannot be made or the body cannot be read.
func MakeAPICall(t *testing.T, url string) (status int, response string) {
	t.Helper()
	res, err := http.Get(url)
	require.NoError(t, err)
	// The body must be closed once read, otherwise every call (and every retry made by
	// callers that poll this function) leaks the underlying connection.
	defer res.Body.Close()
	bodyBytes, err := ioutil.ReadAll(res.Body)
	require.NoError(t, err)
	return res.StatusCode, string(bodyBytes)
}

// MakeAPICallUntilRegistered is used to make an API call and retry if we see a 500 - no successor promoted output. This happens when some other recovery had previously run
// and the API recovery was unable to be registered due to active timeout period.
// Any other response is returned to the caller as is. The test is failed if the call is
// still being retried once the 10 second deadline has passed.
func MakeAPICallUntilRegistered(t *testing.T, url string) (status int, response string) {
	// report failures at the caller's line, like MakeAPICall does
	t.Helper()
	timeout := time.After(10 * time.Second)
	for {
		// non-blocking check of the deadline before each attempt
		select {
		case <-timeout:
			t.Fatal("timedout waiting for api to register correctly")
		default:
		}

		status, response = MakeAPICall(t, url)
		if status == 500 && strings.Contains(response, "no successor promoted") {
			// the recovery was not registered; wait a little and try again
			time.Sleep(1 * time.Second)
			continue
		}
		return status, response
	}
}
10 changes: 4 additions & 6 deletions go/vt/orchestrator/app/cli.go
Original file line number Diff line number Diff line change
Expand Up @@ -1200,13 +1200,12 @@ func Cli(command string, strict bool, instance string, destination string, owner
if destinationKey != nil {
validateInstanceIsFound(destinationKey)
}
topologyRecovery, promotedPrimaryCoordinates, err := logic.GracefulPrimaryTakeover(clusterName, destinationKey, false)
topologyRecovery, err := logic.GracefulPrimaryTakeover(clusterName, destinationKey)
if err != nil {
log.Fatale(err)
}
fmt.Println(topologyRecovery.SuccessorKey.DisplayString())
fmt.Println(*promotedPrimaryCoordinates)
log.Debugf("Promoted %+v as new primary. Binlog coordinates at time of promotion: %+v", topologyRecovery.SuccessorKey, *promotedPrimaryCoordinates)
log.Debugf("Promoted %+v as new primary.", topologyRecovery.SuccessorKey)
}
case registerCliCommand("graceful-primary-takeover-auto", "Recovery", `Gracefully promote a new primary. orchestrator will attempt to pick the promoted replica automatically`):
{
Expand All @@ -1216,13 +1215,12 @@ func Cli(command string, strict bool, instance string, destination string, owner
if destinationKey != nil {
validateInstanceIsFound(destinationKey)
}
topologyRecovery, promotedPrimaryCoordinates, err := logic.GracefulPrimaryTakeover(clusterName, destinationKey, true)
topologyRecovery, err := logic.GracefulPrimaryTakeover(clusterName, destinationKey)
if err != nil {
log.Fatale(err)
}
fmt.Println(topologyRecovery.SuccessorKey.DisplayString())
fmt.Println(*promotedPrimaryCoordinates)
log.Debugf("Promoted %+v as new primary. Binlog coordinates at time of promotion: %+v", topologyRecovery.SuccessorKey, *promotedPrimaryCoordinates)
log.Debugf("Promoted %+v as new primary.", topologyRecovery.SuccessorKey)
}
case registerCliCommand("replication-analysis", "Recovery", `Request an analysis of potential crash incidents in all known topologies`):
{
Expand Down
Loading

0 comments on commit 856aef4

Please sign in to comment.