Skip to content

Commit

Permalink
introduce func with explanation
Browse files Browse the repository at this point in the history
  • Loading branch information
hanshasselberg committed Dec 12, 2019
1 parent 81dc1d3 commit 86220e8
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 35 deletions.
73 changes: 38 additions & 35 deletions agent/consul/autopilot/autopilot.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,24 @@ func (a *Autopilot) RemoveDeadServers() {
}
}

func canRemoveServers(peers, minQuorum, deadServers int) (bool, string) {
if deadServers == 0 {
return false, "nothing to do"
}

if peers-deadServers < int(minQuorum) {
return false, fmt.Sprintf("denied, because removing %d/%d servers would leave less then minimal allowed quorum of %d servers", deadServers, peers, minQuorum)
}

// Only do removals if a minority of servers will be affected.
// For failure tolerance of F we need n = 2F+1 servers.
// This means we can safely remove up to (n-1)/2 servers.
if deadServers > (peers-1)/2 {
return false, fmt.Sprintf("denied, because removing the majority of servers %d/%d is not safe", deadServers, peers)
}
return true, fmt.Sprintf("allowed, because removing %d/%d servers leaves a majority of servers above the minimal allowed quorum %d", deadServers, peers, minQuorum)
}

// pruneDeadServers removes up to numPeers/2 failed servers
func (a *Autopilot) pruneDeadServers() error {
conf := a.delegate.AutopilotConfig()
Expand Down Expand Up @@ -226,50 +244,35 @@ func (a *Autopilot) pruneDeadServers() error {
}
}

// We can bail early if there's nothing to do.
removalCount := len(failed) + len(staleRaftServers)
if removalCount == 0 {
if ok, msg := canRemoveServers(NumPeers(raftConfig), int(conf.MinQuorum), len(failed)+len(staleRaftServers)); !ok {
a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: %s.", msg)
return nil
}

peers := NumPeers(raftConfig)
for _, node := range failed {
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
go serfLAN.RemoveFailedNode(node.Name)
if serfWAN != nil {
go serfWAN.RemoveFailedNode(fmt.Sprintf("%s.%s", node.Name, node.Tags["dc"]))
}

if peers-removalCount < int(conf.MinQuorum) {
a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: removing %d would be less then quorum of %d.", removalCount, conf.MinQuorum)
return nil
}

// Only do removals if a minority of servers will be affected.
// For failure tolerance of F we need n = 2F+1 servers.
// This means we can safely remove up to (n-1)/2 servers.
if removalCount <= (peers-1)/2 {
for _, node := range failed {
a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node.Name)
go serfLAN.RemoveFailedNode(node.Name)
if serfWAN != nil {
go serfWAN.RemoveFailedNode(fmt.Sprintf("%s.%s", node.Name, node.Tags["dc"]))
}

minRaftProtocol, err := a.MinRaftProtocol()
if err != nil {
return err
}
for _, raftServer := range staleRaftServers {
a.logger.Printf("[INFO] autopilot: Attempting removal of stale %s", fmtServer(raftServer))
var future raft.Future
if minRaftProtocol >= 2 {
future = raftNode.RemoveServer(raftServer.ID, 0, 0)
} else {
future = raftNode.RemovePeer(raftServer.Address)
}

minRaftProtocol, err := a.MinRaftProtocol()
if err != nil {
if err := future.Error(); err != nil {
return err
}
for _, raftServer := range staleRaftServers {
a.logger.Printf("[INFO] autopilot: Attempting removal of stale %s", fmtServer(raftServer))
var future raft.Future
if minRaftProtocol >= 2 {
future = raftNode.RemoveServer(raftServer.ID, 0, 0)
} else {
future = raftNode.RemovePeer(raftServer.Address)
}
if err := future.Error(); err != nil {
return err
}
}
} else {
a.logger.Printf("[DEBUG] autopilot: Failed to remove dead servers: too many dead servers: %d/%d", removalCount, peers)
}

return nil
Expand Down
25 changes: 25 additions & 0 deletions agent/consul/autopilot/autopilot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"testing"

"github.com/hashicorp/serf/serf"
"github.com/stretchr/testify/require"
)

func TestMinRaftProtocol(t *testing.T) {
Expand Down Expand Up @@ -84,3 +85,27 @@ func TestMinRaftProtocol(t *testing.T) {
}
}
}

func TestAutopilot_canRemoveServers(t *testing.T) {
type test struct {
peers int
minQuorum int
deadServers int
ok bool
}

tests := []test{
{1, 1, 1, false},
{3, 3, 1, false},
{4, 3, 3, false},
{5, 3, 3, false},
{5, 3, 2, true},
{5, 3, 1, true},
{9, 3, 5, false},
}
for _, test := range tests {
ok, msg := canRemoveServers(test.peers, test.minQuorum, test.deadServers)
require.Equal(t, test.ok, ok)
t.Logf("%+v: %s", test, msg)
}
}

0 comments on commit 86220e8

Please sign in to comment.