Skip to content

Commit

Permalink
Merge pull request #8082 from hashicorp/f-raft-multipler
Browse files Browse the repository at this point in the history
Implement raft multipler flag
  • Loading branch information
Mahmood Ali authored Jun 19, 2020
2 parents 8fc76f5 + 744539c commit 0821c0a
Show file tree
Hide file tree
Showing 8 changed files with 161 additions and 4 deletions.
20 changes: 20 additions & 0 deletions command/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ const (
// roles used in identifying Consul entries for Nomad agents
consulRoleServer = "server"
consulRoleClient = "client"

// DefaultRaftMultiplier is used as a baseline Raft configuration that
// will be reliable on a very basic server.
DefaultRaftMultiplier = 1

// MaxRaftMultiplier is a fairly arbitrary upper bound that limits the
// amount of performance detuning that's possible.
MaxRaftMultiplier = 10
)

// Agent is a long running daemon that is used to run both
Expand Down Expand Up @@ -180,6 +188,18 @@ func convertServerConfig(agentConfig *Config) (*nomad.Config, error) {
if agentConfig.Server.RaftProtocol != 0 {
conf.RaftConfig.ProtocolVersion = raft.ProtocolVersion(agentConfig.Server.RaftProtocol)
}
raftMultiplier := int(DefaultRaftMultiplier)
if agentConfig.Server.RaftMultiplier != nil && *agentConfig.Server.RaftMultiplier != 0 {
raftMultiplier = *agentConfig.Server.RaftMultiplier
if raftMultiplier < 1 || raftMultiplier > MaxRaftMultiplier {
return nil, fmt.Errorf("raft_multiplier cannot be %d. Must be between 1 and %d", *agentConfig.Server.RaftMultiplier, MaxRaftMultiplier)
}
}
conf.RaftConfig.ElectionTimeout *= time.Duration(raftMultiplier)
conf.RaftConfig.HeartbeatTimeout *= time.Duration(raftMultiplier)
conf.RaftConfig.LeaderLeaseTimeout *= time.Duration(raftMultiplier)
conf.RaftConfig.CommitTimeout *= time.Duration(raftMultiplier)

if agentConfig.Server.NumSchedulers != nil {
conf.NumSchedulers = *agentConfig.Server.NumSchedulers
}
Expand Down
108 changes: 108 additions & 0 deletions command/agent/agent_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package agent

import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
Expand Down Expand Up @@ -362,6 +363,113 @@ func TestAgent_ServerConfig_Limits_OK(t *testing.T) {
}
}

// TestAgent_ServerConfig_RaftMultiplier_Ok verifies that convertServerConfig
// accepts an unset, zero, or in-range raft_multiplier and scales the Raft
// election, heartbeat, leader-lease, and commit timeouts by that multiplier.
func TestAgent_ServerConfig_RaftMultiplier_Ok(t *testing.T) {
	t.Parallel()

	cases := []struct {
		multiplier         *int
		electionTimeout    time.Duration
		heartbeatTimeout   time.Duration
		leaderLeaseTimeout time.Duration
		commitTimeout      time.Duration
	}{
		// nil and 0 both fall back to the default multiplier, so the
		// resulting timeouts are the Raft library defaults.
		// Expected values are hardcoded to detect changes from raft.
		{
			multiplier: nil,

			electionTimeout:    1 * time.Second,
			heartbeatTimeout:   1 * time.Second,
			leaderLeaseTimeout: 500 * time.Millisecond,
			commitTimeout:      50 * time.Millisecond,
		},

		{
			multiplier: helper.IntToPtr(0),

			electionTimeout:    1 * time.Second,
			heartbeatTimeout:   1 * time.Second,
			leaderLeaseTimeout: 500 * time.Millisecond,
			commitTimeout:      50 * time.Millisecond,
		},
		{
			multiplier: helper.IntToPtr(1),

			electionTimeout:    1 * time.Second,
			heartbeatTimeout:   1 * time.Second,
			leaderLeaseTimeout: 500 * time.Millisecond,
			commitTimeout:      50 * time.Millisecond,
		},
		{
			multiplier: helper.IntToPtr(5),

			electionTimeout:    5 * time.Second,
			heartbeatTimeout:   5 * time.Second,
			leaderLeaseTimeout: 2500 * time.Millisecond,
			commitTimeout:      250 * time.Millisecond,
		},
		{
			multiplier: helper.IntToPtr(6),

			electionTimeout:    6 * time.Second,
			heartbeatTimeout:   6 * time.Second,
			leaderLeaseTimeout: 3000 * time.Millisecond,
			commitTimeout:      300 * time.Millisecond,
		},
		{
			multiplier: helper.IntToPtr(10),

			electionTimeout:    10 * time.Second,
			heartbeatTimeout:   10 * time.Second,
			leaderLeaseTimeout: 5000 * time.Millisecond,
			commitTimeout:      500 * time.Millisecond,
		},
	}

	for _, tc := range cases {
		// Name the subtest after the multiplier; "default" marks the unset case.
		v := "default"
		if tc.multiplier != nil {
			v = fmt.Sprintf("%v", *tc.multiplier)
		}
		t.Run(v, func(t *testing.T) {
			conf := DevConfig(nil)
			require.NoError(t, conf.normalizeAddrs())

			conf.Server.RaftMultiplier = tc.multiplier

			serverConf, err := convertServerConfig(conf)
			require.NoError(t, err)

			// assert (not require) so every mismatching timeout is reported.
			assert.Equal(t, tc.electionTimeout, serverConf.RaftConfig.ElectionTimeout, "election timeout")
			assert.Equal(t, tc.heartbeatTimeout, serverConf.RaftConfig.HeartbeatTimeout, "heartbeat timeout")
			assert.Equal(t, tc.leaderLeaseTimeout, serverConf.RaftConfig.LeaderLeaseTimeout, "leader lease timeout")
			assert.Equal(t, tc.commitTimeout, serverConf.RaftConfig.CommitTimeout, "commit timeout")
		})
	}
}

// TestAgent_ServerConfig_RaftMultiplier_Bad verifies that convertServerConfig
// rejects raft_multiplier values outside the accepted range with a
// "raft_multiplier cannot be" error.
func TestAgent_ServerConfig_RaftMultiplier_Bad(t *testing.T) {
	t.Parallel()

	cases := []int{
		-1,
		100,
	}

	for _, tc := range cases {
		t.Run(fmt.Sprintf("%v", tc), func(t *testing.T) {
			conf := DevConfig(nil)
			require.NoError(t, conf.normalizeAddrs())

			// Store a pointer to a fresh copy rather than &tc: before Go 1.22
			// the range variable is shared across iterations, so its address
			// would alias every case.
			conf.Server.RaftMultiplier = helper.IntToPtr(tc)

			_, err := convertServerConfig(conf)
			require.Error(t, err)
			require.Contains(t, err.Error(), "raft_multiplier cannot be")
		})
	}
}

func TestAgent_ClientConfig(t *testing.T) {
t.Parallel()
conf := DefaultConfig()
Expand Down
7 changes: 7 additions & 0 deletions command/agent/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,9 @@ type ServerConfig struct {
// RaftProtocol is the Raft protocol version to speak. This must be from [1-3].
RaftProtocol int `hcl:"raft_protocol"`

// RaftMultiplier scales the Raft timing parameters
RaftMultiplier *int `hcl:"raft_multiplier"`

// NumSchedulers is the number of scheduler thread that are run.
// This can be as many as one per core, or zero to disable this server
// from doing any scheduling work.
Expand Down Expand Up @@ -1315,6 +1318,10 @@ func (a *ServerConfig) Merge(b *ServerConfig) *ServerConfig {
if b.RaftProtocol != 0 {
result.RaftProtocol = b.RaftProtocol
}
if b.RaftMultiplier != nil {
c := *b.RaftMultiplier
result.RaftMultiplier = &c
}
if b.NumSchedulers != nil {
result.NumSchedulers = helper.IntToPtr(*b.NumSchedulers)
}
Expand Down
1 change: 1 addition & 0 deletions command/agent/config_parse_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ var basicConfig = &Config{
DataDir: "/tmp/data",
ProtocolVersion: 3,
RaftProtocol: 3,
RaftMultiplier: helper.IntToPtr(4),
NumSchedulers: helper.IntToPtr(2),
EnabledSchedulers: []string{"test"},
NodeGCThreshold: "12h",
Expand Down
6 changes: 3 additions & 3 deletions command/agent/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ func TestConfig_Merge(t *testing.T) {
DataDir: "/tmp/data1",
ProtocolVersion: 1,
RaftProtocol: 1,
RaftMultiplier: helper.IntToPtr(5),
NumSchedulers: helper.IntToPtr(1),
NodeGCThreshold: "1h",
HeartbeatGrace: 30 * time.Second,
Expand Down Expand Up @@ -317,6 +318,7 @@ func TestConfig_Merge(t *testing.T) {
DataDir: "/tmp/data2",
ProtocolVersion: 2,
RaftProtocol: 2,
RaftMultiplier: helper.IntToPtr(6),
NumSchedulers: helper.IntToPtr(2),
EnabledSchedulers: []string{structs.JobTypeBatch},
NodeGCThreshold: "12h",
Expand Down Expand Up @@ -425,9 +427,7 @@ func TestConfig_Merge(t *testing.T) {
result := c0.Merge(c1)
result = result.Merge(c2)
result = result.Merge(c3)
if !reflect.DeepEqual(result, c3) {
t.Fatalf("bad:\n%#v\n%#v", result, c3)
}
require.Equal(t, c3, result)
}

func TestConfig_ParseConfigFile(t *testing.T) {
Expand Down
1 change: 1 addition & 0 deletions command/agent/testdata/basic.hcl
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ server {
redundancy_zone = "foo"
upgrade_version = "0.8.0"
encrypt = "abc"
raft_multiplier = 4

server_join {
retry_join = ["1.1.1.1", "2.2.2.2"]
Expand Down
1 change: 1 addition & 0 deletions command/agent/testdata/basic.json
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@
"num_schedulers": 2,
"protocol_version": 3,
"raft_protocol": 3,
"raft_multiplier": 4,
"redundancy_zone": "foo",
"rejoin_after_leave": true,
"retry_interval": "15s",
Expand Down
21 changes: 20 additions & 1 deletion website/pages/docs/configuration/server.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,25 @@ server {
features and is typically not required as the agent internally knows the
latest version, but may be useful in some upgrade scenarios.

- `raft_multiplier` `(int: 1)` - An integer multiplier used by Nomad servers to
scale key Raft timing parameters. Omitting this value or setting it to 0 uses
the default timing described below. Lower values are used to tighten timing and
increase sensitivity while higher values relax timings and reduce sensitivity.
Tuning this affects the time it takes Nomad to detect leader failures and to
perform leader elections, at the expense of requiring more network and CPU
resources for better performance. The maximum allowed value is 10.

By default, Nomad will use the highest-performance timing, currently equivalent
to setting this to a value of 1. Increasing the timings makes leader election
less likely during periods of networking issues or resource starvation. Since
leader elections pause Nomad's normal work, it may be beneficial for slow or
unreliable networks to wait longer before electing a new leader. The tradeoff
when raising this value is that during network partitions or other events
(server crash) where a leader is lost, Nomad will not elect a new leader for
a longer period of time than the default. The [`nomad.nomad.leader.barrier` and
`nomad.raft.leader.lastContact` metrics](/docs/telemetry/metrics) are good
indicators of how often leader elections occur and of Raft latency.

- `redundancy_zone` `(string: "")` - (Enterprise-only) Specifies the redundancy
zone that this server will be a part of for Autopilot management. For more
information, see the [Autopilot Guide](https://learn.hashicorp.com/nomad/operating-nomad/autopilot).
Expand Down Expand Up @@ -275,4 +294,4 @@ server {
[encryption]: https://learn.hashicorp.com/nomad/transport-security/gossip-encryption 'Nomad Encryption Overview'
[server-join]: /docs/configuration/server_join 'Server Join'
[update-scheduler-config]: /api-docs/operator#update-scheduler-configuration 'Scheduler Config'
[bootstrapping a cluster]: /docs/faq#bootstrapping
[bootstrapping a cluster]: /docs/faq#bootstrapping

0 comments on commit 0821c0a

Please sign in to comment.