diff --git a/agent/consul/autopilot/autopilot.go b/agent/consul/autopilot/autopilot.go index b50252ce510c..5b449e24ff1d 100644 --- a/agent/consul/autopilot/autopilot.go +++ b/agent/consul/autopilot/autopilot.go @@ -222,7 +222,9 @@ func (a *Autopilot) pruneDeadServers() error { // Only do removals if a minority of servers will be affected. peers := NumPeers(raftConfig) - if removalCount < peers/2 { + // For failure tolerance of F we need n = 2F+1 servers. + // This means we can safely remove up to (n-1)/2 servers + if removalCount <= (peers-1)/2 { for _, node := range failed { a.logger.Printf("[INFO] autopilot: Attempting removal of failed server node %q", node) go serfLAN.RemoveFailedNode(node) diff --git a/agent/consul/autopilot_test.go b/agent/consul/autopilot_test.go index ea88dcca3205..b6ae4ffeb463 100644 --- a/agent/consul/autopilot_test.go +++ b/agent/consul/autopilot_test.go @@ -9,6 +9,7 @@ import ( "github.com/hashicorp/consul/testutil/retry" "github.com/hashicorp/raft" "github.com/hashicorp/serf/serf" + "github.com/stretchr/testify/require" ) func TestAutopilot_IdempotentShutdown(t *testing.T) { @@ -36,7 +37,7 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) { conf := func(c *Config) { c.Datacenter = "dc1" c.Bootstrap = false - c.BootstrapExpect = 3 + c.BootstrapExpect = 5 c.RaftConfig.ProtocolVersion = raft.ProtocolVersion(raftVersion) } dir1, s1 := testServerWithConfig(t, conf) @@ -51,23 +52,49 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) { defer os.RemoveAll(dir3) defer s3.Shutdown() - servers := []*Server{s1, s2, s3} + dir4, s4 := testServerWithConfig(t, conf) + defer os.RemoveAll(dir4) + defer s4.Shutdown() + + dir5, s5 := testServerWithConfig(t, conf) + defer os.RemoveAll(dir5) + defer s5.Shutdown() + + servers := []*Server{s1, s2, s3, s4, s5} // Try to join joinLAN(t, s2, s1) joinLAN(t, s3, s1) + joinLAN(t, s4, s1) + joinLAN(t, s5, s1) for _, s := range servers { - retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) }) + retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 5)) }) } - // Bring up a new server - dir4, s4 := testServerWithConfig(t, conf) - defer os.RemoveAll(dir4) - defer s4.Shutdown() + require := require.New(t) + testrpc.WaitForLeader(t, s1.RPC, "dc1") + leaderIndex := -1 + for i, s := range servers { + if s.IsLeader() { + leaderIndex = i + break + } + } + require.NotEqual(leaderIndex, -1) + + // Shutdown two non-leader servers + killed := make(map[string]struct{}) + for i, s := range servers { + if i != leaderIndex { + s.Shutdown() + killed[string(s.config.NodeID)] = struct{}{} + } + if len(killed) == 2 { + break + } + } - // Kill a non-leader server - s3.Shutdown() retry.Run(t, func(r *retry.R) { alive := 0 for _, m := range s1.LANMembers() { @@ -75,18 +102,17 @@ func testCleanupDeadServer(t *testing.T, raftVersion int) { alive++ } } - if alive != 2 { - r.Fatal(nil) + if alive != 3 { + r.Fatal("Expected three alive servers") } }) - // Join the new server - joinLAN(t, s4, s1) - servers[2] = s4 - - // Make sure the dead server is removed and we're back to 3 total peers + // Make sure the dead servers are removed and we're back to 3 total peers for _, s := range servers { - retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) }) + _, killed := killed[string(s.config.NodeID)] + if !killed { + retry.Run(t, func(r *retry.R) { r.Check(wantPeers(s, 3)) }) + } } }