diff --git a/nomad/leader.go b/nomad/leader.go index a1a65b05f25..119e72ebfdd 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -727,11 +727,6 @@ func (s *Server) reconcileMember(member serf.Member) error { } defer metrics.MeasureSince([]string{"nomad", "leader", "reconcileMember"}, time.Now()) - // Do not reconcile ourself - if member.Name == fmt.Sprintf("%s.%s", s.config.NodeName, s.config.Region) { - return nil - } - var err error switch member.Status { case serf.StatusAlive: @@ -768,12 +763,6 @@ func (s *Server) reconcileJobSummaries() error { // addRaftPeer is used to add a new Raft peer when a Nomad server joins func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { - // Do not join ourselfs - if m.Name == s.config.NodeName { - s.logger.Printf("[DEBUG] nomad: adding self (%q) as raft peer skipped", m.Name) - return nil - } - // Check for possibility of multiple bootstrap nodes members := s.serf.Members() if parts.Bootstrap { @@ -786,17 +775,19 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { } } - // See if it's already in the configuration. It's harmless to re-add it - // but we want to avoid doing that if possible to prevent useless Raft - // log entries. + // Processing ourselves could result in trying to remove ourselves to + // fix up our address, which would make us step down. This is only + // safe to attempt if there are multiple servers available. addr := (&net.TCPAddr{IP: m.Addr, Port: parts.Port}).String() configFuture := s.raft.GetConfiguration() if err := configFuture.Error(); err != nil { s.logger.Printf("[ERR] nomad: failed to get raft configuration: %v", err) return err } - for _, server := range configFuture.Configuration().Servers { - if server.Address == raft.ServerAddress(addr) { + + if m.Name == s.config.NodeName { + if l := len(configFuture.Configuration().Servers); l < 3 { + s.logger.Printf("[DEBUG] consul: Skipping self join check for %q since the cluster is too small", m.Name) return nil } } @@ -817,7 +808,7 @@ func (s *Server) addRaftPeer(m serf.Member, parts *serverParts) error { // If the address or ID matches an existing server, see if we need to remove the old one first if server.Address == raft.ServerAddress(addr) || server.ID == raft.ServerID(parts.ID) { - // Exit with no-op if this is being called on an existing server + // Exit with no-op if this is being called on an existing server and both the ID and address match if server.Address == raft.ServerAddress(addr) && server.ID == raft.ServerID(parts.ID) { return nil } diff --git a/nomad/leader_test.go b/nomad/leader_test.go index 926e12e1dd3..2475ab84487 100644 --- a/nomad/leader_test.go +++ b/nomad/leader_test.go @@ -3,6 +3,7 @@ package nomad import ( "errors" "fmt" + "strconv" "testing" "time" @@ -13,6 +14,7 @@ import ( "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" "github.com/hashicorp/raft" + "github.com/hashicorp/serf/serf" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -1112,3 +1114,88 @@ func TestLeader_RevokeLeadership_MultipleTimes(t *testing.T) { require.Nil(t, s1.revokeLeadership()) require.Nil(t, s1.revokeLeadership()) } + +// Test doing an inplace upgrade on a server from raft protocol 2 to 3 +// This verifies that removing the server and adding it back with a uuid works +// even if the server's address stays the same. +func TestServer_ReconcileMember(t *testing.T) { + // Create a three node cluster + t.Parallel() + s1 := TestServer(t, func(c *Config) { + c.DevDisableBootstrap = true + c.RaftConfig.ProtocolVersion = 3 + }) + defer s1.Shutdown() + + s2 := TestServer(t, func(c *Config) { + c.DevDisableBootstrap = true + c.RaftConfig.ProtocolVersion = 3 + }) + defer s2.Shutdown() + + s3 := TestServer(t, func(c *Config) { + c.DevDisableBootstrap = true + c.RaftConfig.ProtocolVersion = 2 + }) + defer s3.Shutdown() + TestJoin(t, s1, s2, s3) + testutil.WaitForLeader(t, s1.RPC) + + // Create a memberlist object for s3, with raft protocol upgraded to 3 + upgradedS3Member := serf.Member{ + Name: s3.config.NodeName, + Addr: s3.config.RPCAddr.IP, + Status: serf.StatusAlive, + Tags: make(map[string]string), + } + upgradedS3Member.Tags["role"] = "nomad" + upgradedS3Member.Tags["id"] = s3.config.NodeID + upgradedS3Member.Tags["region"] = s3.config.Region + upgradedS3Member.Tags["dc"] = s3.config.Datacenter + upgradedS3Member.Tags["rpc_addr"] = "127.0.0.1" + upgradedS3Member.Tags["port"] = strconv.Itoa(s3.config.RPCAddr.Port) + upgradedS3Member.Tags["build"] = "0.8.0" + upgradedS3Member.Tags["vsn"] = "2" + upgradedS3Member.Tags["mvn"] = "1" + upgradedS3Member.Tags["raft_vsn"] = "3" + + // Find the leader so that we can call reconcile member on it + var leader *Server + for _, s := range []*Server{s1, s2, s3} { + if s.IsLeader() { + leader = s + } + } + leader.reconcileMember(upgradedS3Member) + // This should remove s3 from the config and potentially cause a leader election + testutil.WaitForLeader(t, s1.RPC) + + // Figure out the new leader and call reconcile again, this should add s3 with the new ID format + for _, s := range []*Server{s1, s2, s3} { + if s.IsLeader() { + leader = s + } + } + leader.reconcileMember(upgradedS3Member) + testutil.WaitForLeader(t, s1.RPC) + future := s2.raft.GetConfiguration() + if err := future.Error(); err != nil { + t.Fatal(err) + } + addrs := 0 + ids := 0 + for _, server := range future.Configuration().Servers { + if string(server.ID) == string(server.Address) { + addrs++ + } else { + ids++ + } + } + // After this, all three servers should have IDs in raft + if got, want := addrs, 0; got != want { + t.Fatalf("got %d server addresses want %d", got, want) + } + if got, want := ids, 3; got != want { + t.Fatalf("got %d server ids want %d", got, want) + } +} diff --git a/website/source/docs/upgrade/upgrade-specific.html.md b/website/source/docs/upgrade/upgrade-specific.html.md index 4409cfe4a08..03bdec99688 100644 --- a/website/source/docs/upgrade/upgrade-specific.html.md +++ b/website/source/docs/upgrade/upgrade-specific.html.md @@ -29,7 +29,7 @@ to match the default. The Raft protocol must be stepped up in this way; only adjacent version numbers are compatible (for example, version 1 cannot talk to version 3). Here is a table of the -Raft Protocol versions supported by each Consul version: +Raft Protocol versions supported by each Nomad version: @@ -53,6 +53,17 @@ Raft Protocol versions supported by each Consul version: In order to enable all [Autopilot](/guides/cluster/autopilot.html) features, all servers in a Nomad cluster must be running with Raft protocol version 3 or later. +#### Upgrading to Raft Protocol 3 + +This section provides details on upgrading to Raft Protocol 3 in Nomad 0.8 and higher. Raft protocol version 3 requires Nomad running 0.8.0 or newer on all servers in order to work. See [Raft Protocol Version Compatibility](/docs/upgrade/upgrade-specific.html#raft-protocol-version-compatibility) for more details. Also the format of `peers.json` used for outage recovery is different when running with the latest Raft protocol. See [Manual Recovery Using peers.json](/guides/outage.html#manual-recovery-using-peers-json) for a description of the required format. + +Please note that the Raft protocol is different from Nomad's internal protocol as shown in commands like `nomad server members`. To see the version of the Raft protocol in use on each server, use the `nomad operator raft list-peers` command. + +The easiest way to upgrade servers is to have each server leave the cluster, upgrade its `raft_protocol` version in the `server` stanza, and then add it back. Make sure the new server joins successfully and that the cluster is stable before rolling the upgrade forward to the next server. It's also possible to stand up a new set of servers, and then slowly stand down each of the older servers in a similar fashion. + +When using Raft protocol version 3, servers are identified by their `node-id` instead of their IP address when Nomad makes changes to its internal Raft quorum configuration. This means that once a cluster has been upgraded with servers all running Raft protocol version 3, it will no longer allow servers running any older Raft protocol versions to be added. If running a single Nomad server, restarting it in-place will result in that server not being able to elect itself as a leader. To avoid this, either set the Raft protocol back to 2, or use [Manual Recovery Using peers.json](/docs/guides/outage.html#manual-recovery-using-peers-json) to map the server to its node ID in the Raft quorum configuration. + + ### Node Draining Improvements Node draining via the [`node drain`][drain-cli] command or the [drain diff --git a/website/source/guides/outage.html.markdown b/website/source/guides/outage.html.markdown index 4f936dd3b1f..e698d0dc59b 100644 --- a/website/source/guides/outage.html.markdown +++ b/website/source/guides/outage.html.markdown @@ -186,3 +186,39 @@ nomad-server01.global 10.10.11.5:4647 10.10.11.5:4647 follower true nomad-server02.global 10.10.11.6:4647 10.10.11.6:4647 leader true nomad-server03.global 10.10.11.7:4647 10.10.11.7:4647 follower true ``` + +## Peers.json Format Changes in Raft Protocol 3 +For Raft protocol version 3 and later, peers.json should be formatted as a JSON +array containing the node ID, address:port, and suffrage information of each +Nomad server in the cluster, like this: + +``` +[ + { + "id": "adf4238a-882b-9ddc-4a9d-5b6758e4159e", + "address": "10.1.0.1:8300", + "non_voter": false + }, + { + "id": "8b6dda82-3103-11e7-93ae-92361f002671", + "address": "10.1.0.2:8300", + "non_voter": false + }, + { + "id": "97e17742-3103-11e7-93ae-92361f002671", + "address": "10.1.0.3:8300", + "non_voter": false + } +] +``` + +- `id` `(string: )` - Specifies the `node ID` + of the server. This can be found in the logs when the server starts up, + and it can also be found inside the `node-id` file in the server's data directory. + +- `address` `(string: )` - Specifies the IP and port of the server. The port is the + server's RPC port used for cluster communications. + +- `non_voter` `(bool: )` - This controls whether the server is a non-voter, which is used + in some advanced [Autopilot](/guides/cluster/autopilot.html) configurations. If omitted, it will + default to false, which is typical for most clusters.