diff --git a/command/agent/command.go b/command/agent/command.go index 98c6fa3a5be..74bbf96149e 100644 --- a/command/agent/command.go +++ b/command/agent/command.go @@ -598,6 +598,8 @@ WAIT: } } +// reloadHTTPServer shuts down the existing HTTP server and restarts it. This +// is helpful when reloading the agent configuration. func (c *Command) reloadHTTPServer() error { c.agent.logger.Println("[INFO] agent: Reloading HTTP server with new TLS configuration") diff --git a/nomad/raft_rpc.go b/nomad/raft_rpc.go index 4cad9e734a6..e7f73357d57 100644 --- a/nomad/raft_rpc.go +++ b/nomad/raft_rpc.go @@ -21,7 +21,8 @@ type RaftLayer struct { connCh chan net.Conn // TLS wrapper - tlsWrap tlsutil.Wrapper + tlsWrap tlsutil.Wrapper + tlsWrapLock sync.RWMutex // Tracks if we are closed closed bool @@ -78,6 +79,21 @@ func (l *RaftLayer) Close() error { return nil } +// getTLSWrapper is used to retrieve the current TLS wrapper +func (l *RaftLayer) getTLSWrapper() tlsutil.Wrapper { + l.tlsWrapLock.RLock() + defer l.tlsWrapLock.RUnlock() + return l.tlsWrap +} + +// ReloadTLS swaps the TLS wrapper. This is useful when upgrading or +// downgrading TLS connections. +func (l *RaftLayer) ReloadTLS(tlsWrap tlsutil.Wrapper) { + l.tlsWrapLock.Lock() + defer l.tlsWrapLock.Unlock() + l.tlsWrap = tlsWrap +} + // Addr is used to return the address of the listener func (l *RaftLayer) Addr() net.Addr { return l.addr @@ -90,8 +106,10 @@ func (l *RaftLayer) Dial(address raft.ServerAddress, timeout time.Duration) (net return nil, err } + tlsWrapper := l.getTLSWrapper() + // Check for tls mode - if l.tlsWrap != nil { + if tlsWrapper != nil { // Switch the connection into TLS mode if _, err := conn.Write([]byte{byte(rpcTLS)}); err != nil { conn.Close() @@ -99,7 +117,7 @@ func (l *RaftLayer) Dial(address raft.ServerAddress, timeout time.Duration) (net } // Wrap the connection in a TLS client - conn, err = l.tlsWrap(conn) + conn, err = tlsWrapper(conn) if err != nil { return nil, err } diff --git a/nomad/server.go b/nomad/server.go index f540d53b7b9..41e58ded0bc 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -84,8 +84,7 @@ const ( // Server is Nomad server which manages the job queues, // schedulers, and notification bus for agents. type Server struct { - config *Config - configLock sync.Mutex + config *Config logger *log.Logger @@ -97,12 +96,11 @@ type Server struct { // The raft instance is used among Nomad nodes within the // region to protect operations that require strong consistency - leaderCh <-chan bool - raft *raft.Raft - raftLayer *RaftLayer - raftStore *raftboltdb.BoltStore - raftInmem *raft.InmemStore - + leaderCh <-chan bool + raft *raft.Raft + raftLayer *RaftLayer + raftStore *raftboltdb.BoltStore + raftInmem *raft.InmemStore raftTransport *raft.NetworkTransport // fsm is the state machine used with Raft @@ -417,9 +415,7 @@ func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error { // Keeping configuration in sync is important for other places that require // access to config information, such as rpc.go, where we decide on what kind // of network connections to accept depending on the server configuration - s.configLock.Lock() s.config.TLSConfig = newTLSConfig - s.configLock.Unlock() s.rpcTLS = incomingTLS s.connPool.ReloadTLS(tlsWrap) @@ -436,13 +432,9 @@ func (s *Server) reloadTLSConnections(newTLSConfig *config.TLSConfig) error { } // Close and reload existing Raft connections - s.raftTransport.Pause() - s.raftLayer.Close() wrapper := tlsutil.RegionSpecificWrapper(s.config.Region, tlsWrap) - s.raftLayer = NewRaftLayer(s.rpcAdvertise, wrapper) - s.raftTransport.Reload(s.raftLayer) - - time.Sleep(3 * time.Second) + s.raftLayer.ReloadTLS(wrapper) + s.raftTransport.CloseStreams() s.logger.Printf("[DEBUG] nomad: finished reloading server connections") return nil diff --git a/nomad/server_test.go b/nomad/server_test.go index 34b8ff11382..bb3381293a1 100644 --- a/nomad/server_test.go +++ b/nomad/server_test.go @@ -14,7 +14,6 @@ import ( "time" "github.com/hashicorp/consul/lib/freeport" - memdb "github.com/hashicorp/go-memdb" msgpackrpc "github.com/hashicorp/net-rpc-msgpackrpc" "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper/uuid" @@ -417,52 +416,9 @@ func TestServer_Reload_TLSConnections_Raft(t *testing.T) { defer s2.Shutdown() testJoin(t, s1, s2) + servers := []*Server{s1, s2} - testutil.WaitForResult(func() (bool, error) { - peers, _ := s1.numPeers() - return peers == 2, nil - }, func(err error) { - t.Fatalf("should have 2 peers") - }) - - testutil.WaitForLeader(t, s2.RPC) - - { - // assert that a job register request will succeed - codec := rpcClient(t, s2) - job := mock.Job() - req := &structs.JobRegisterRequest{ - Job: job, - WriteRequest: structs.WriteRequest{ - Region: "regionFoo", - Namespace: job.Namespace, - }, - } - - // Fetch the response - var resp structs.JobRegisterResponse - err := msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp) - assert.Nil(err) - assert.NotEqual(0, resp.Index) - - // Check for the job in the FSM of each server in the cluster - { - state := s2.fsm.State() - ws := memdb.NewWatchSet() - out, err := state.JobByID(ws, job.Namespace, job.ID) - assert.Nil(err) - assert.NotNil(out) - assert.Equal(out.CreateIndex, resp.JobModifyIndex) - } - { - state := s1.fsm.State() - ws := memdb.NewWatchSet() - out, err := state.JobByID(ws, job.Namespace, job.ID) - assert.Nil(err) - assert.NotNil(out) - assert.Equal(out.CreateIndex, resp.JobModifyIndex) - } - } + testutil.WaitForLeader(t, s1.RPC) newTLSConfig := &config.TLSConfig{ EnableHTTP: true, @@ -476,29 +432,19 @@ func TestServer_Reload_TLSConnections_Raft(t *testing.T) { assert.Nil(err) { - // assert that a job register request will fail between servers that - // should not be able to communicate over Raft - codec := rpcClient(t, s2) - job := mock.Job() - req := &structs.JobRegisterRequest{ - Job: job, - WriteRequest: structs.WriteRequest{ - Region: "regionFoo", - Namespace: job.Namespace, - }, + for _, serv := range servers { + testutil.WaitForResult(func() (bool, error) { + args := &structs.GenericRequest{} + var leader string + err := serv.RPC("Status.Leader", args, &leader) + if leader != "" && err != nil { + return false, fmt.Errorf("Should not have found leader but got %s", leader) + } + return true, nil + }, func(err error) { + t.Fatalf("err: %v", err) + }) } - - // TODO(CK) This occasionally is flaky - var resp structs.JobRegisterResponse - err := msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp) - assert.NotNil(err) - assert.True(connectionReset(err.Error())) - - // Check that the job was not persisted - state := s1.fsm.State() - ws := memdb.NewWatchSet() - out, _ := state.JobByID(ws, job.Namespace, job.ID) - assert.Nil(out) } secondNewTLSConfig := &config.TLSConfig{ @@ -515,42 +461,4 @@ func TestServer_Reload_TLSConnections_Raft(t *testing.T) { assert.Nil(err) testutil.WaitForLeader(t, s2.RPC) - - { - // assert that a job register request will succeed - codec := rpcClient(t, s2) - - job := mock.Job() - req := &structs.JobRegisterRequest{ - Job: job, - WriteRequest: structs.WriteRequest{ - Region: "regionFoo", - Namespace: job.Namespace, - }, - } - - // Fetch the response - var resp structs.JobRegisterResponse - err := msgpackrpc.CallWithCodec(codec, "Job.Register", req, &resp) - assert.Nil(err) - assert.NotEqual(0, resp.Index) - - // Check for the job in the FSM of each server in the cluster - { - state := s2.fsm.State() - ws := memdb.NewWatchSet() - out, err := state.JobByID(ws, job.Namespace, job.ID) - assert.Nil(err) - assert.NotNil(out) // TODO(CK) This occasionally is flaky - assert.Equal(out.CreateIndex, resp.JobModifyIndex) - } - { - state := s1.fsm.State() - ws := memdb.NewWatchSet() - out, err := state.JobByID(ws, job.Namespace, job.ID) - assert.Nil(err) - assert.NotNil(out) - assert.Equal(out.CreateIndex, resp.JobModifyIndex) - } - } }