diff --git a/nomad/server.go b/nomad/server.go index f6fbea91de6..6aa6db0a43f 100644 --- a/nomad/server.go +++ b/nomad/server.go @@ -1245,6 +1245,10 @@ func (s *Server) setupSerf(conf *serf.Config, ch chan serf.Event, path string) ( } conf.ProtocolVersion = protocolVersionMap[s.config.ProtocolVersion] conf.RejoinAfterLeave = true + // LeavePropagateDelay is used to make sure broadcasted leave intents propagate. + // This value was tuned using https://www.serf.io/docs/internals/simulator.html to + // allow for convergence in 99.9% of nodes in a 10-node cluster. + conf.LeavePropagateDelay = 1 * time.Second conf.Merge = &serfMergeDelegate{} // Until Nomad supports this fully, we disable automatic resolution. diff --git a/vendor/github.com/hashicorp/serf/serf/config.go b/vendor/github.com/hashicorp/serf/serf/config.go index ad4f51b18a7..79f36f57c75 100644 --- a/vendor/github.com/hashicorp/serf/serf/config.go +++ b/vendor/github.com/hashicorp/serf/serf/config.go @@ -55,6 +55,13 @@ type Config struct { // set, a timeout of 5 seconds will be set. BroadcastTimeout time.Duration + // LeavePropagateDelay is for our leave (node dead) message to propagate + // through the cluster. In particular, we want to stay up long enough to + // service any probes from other nodes before they learn about us + // leaving and stop probing. Otherwise, we risk getting node failures as + // we leave. + LeavePropagateDelay time.Duration + // The settings below relate to Serf's event coalescence feature. Serf // is able to coalesce multiple events into single events in order to // reduce the amount of noise that is sent along the EventCh.
For example @@ -255,6 +262,7 @@ func DefaultConfig() *Config { return &Config{ NodeName: hostname, BroadcastTimeout: 5 * time.Second, + LeavePropagateDelay: 1 * time.Second, EventBuffer: 512, QueryBuffer: 512, LogOutput: os.Stderr, diff --git a/vendor/github.com/hashicorp/serf/serf/delegate.go b/vendor/github.com/hashicorp/serf/serf/delegate.go index 15353150274..871b72e5030 100644 --- a/vendor/github.com/hashicorp/serf/serf/delegate.go +++ b/vendor/github.com/hashicorp/serf/serf/delegate.go @@ -223,13 +223,16 @@ func (d *delegate) MergeRemoteState(buf []byte, isJoin bool) { d.serf.queryClock.Witness(pp.QueryLTime - 1) } - // Process the left nodes first to avoid the LTimes from being increment - // in the wrong order + // Process the left nodes first to avoid the LTimes from incrementing + // in the wrong order. Note that we don't have the actual Lamport time + // for the leave message, so we go one past the join time, since the + // leave must have been accepted after that to get onto the left members + // list. If we didn't do this then the message would not get processed. leftMap := make(map[string]struct{}, len(pp.LeftMembers)) leave := messageLeave{} for _, name := range pp.LeftMembers { leftMap[name] = struct{}{} - leave.LTime = pp.StatusLTimes[name] + leave.LTime = pp.StatusLTimes[name] + 1 leave.Node = name d.serf.handleNodeLeaveIntent(&leave) } diff --git a/vendor/github.com/hashicorp/serf/serf/serf.go b/vendor/github.com/hashicorp/serf/serf/serf.go index 3e89fa11f8b..548807a9d80 100644 --- a/vendor/github.com/hashicorp/serf/serf/serf.go +++ b/vendor/github.com/hashicorp/serf/serf/serf.go @@ -691,6 +691,13 @@ func (s *Serf) Leave() error { return err } + // Wait for the leave to propagate through the cluster. The broadcast + // timeout is how long we wait for the message to go out from our own + // queue, but this wait is for that message to propagate through the + // cluster. 
In particular, we want to stay up long enough to service + // any probes from other nodes before they learn about us leaving. + time.Sleep(s.config.LeavePropagateDelay) + // Transition to Left only if we not already shutdown s.stateLock.Lock() if s.state != SerfShutdown { diff --git a/vendor/vendor.json b/vendor/vendor.json index 9769cfa3d93..73447079b8a 100644 --- a/vendor/vendor.json +++ b/vendor/vendor.json @@ -180,8 +180,8 @@ {"path":"github.com/hashicorp/net-rpc-msgpackrpc","revision":"a14192a58a694c123d8fe5481d4a4727d6ae82f3"}, {"path":"github.com/hashicorp/raft","checksumSHA1":"zkA9uvbj1BdlveyqXpVTh1N6ers=","revision":"077966dbc90f342107eb723ec52fdb0463ec789b","revisionTime":"2018-01-17T20:29:25Z","version":"master","versionExact":"master"}, {"path":"github.com/hashicorp/raft-boltdb","checksumSHA1":"QAxukkv54/iIvLfsUP6IK4R0m/A=","revision":"d1e82c1ec3f15ee991f7cc7ffd5b67ff6f5bbaee","revisionTime":"2015-02-01T20:08:39Z"}, - {"path":"github.com/hashicorp/serf/coordinate","checksumSHA1":"0PeWsO2aI+2PgVYlYlDPKfzCLEQ=","revision":"fc4bdedf2366c64984e280c6eefc703ca7812585","revisionTime":"2018-04-11T17:01:37Z"}, - {"path":"github.com/hashicorp/serf/serf","checksumSHA1":"YzJaaeIJpxLfVDZYT1X2hpd8IK8=","revision":"fc4bdedf2366c64984e280c6eefc703ca7812585","revisionTime":"2018-04-11T17:01:37Z"}, + {"path":"github.com/hashicorp/serf/coordinate","checksumSHA1":"0PeWsO2aI+2PgVYlYlDPKfzCLEQ=","revision":"80ab48778deee28e4ea2dc4ef1ebb2c5f4063996","revisionTime":"2018-05-07T23:19:28Z"}, + {"path":"github.com/hashicorp/serf/serf","checksumSHA1":"QrT+nzyXsD/MmhTjjhcPdnALZ1I=","revision":"80ab48778deee28e4ea2dc4ef1ebb2c5f4063996","revisionTime":"2018-05-07T23:19:28Z"}, {"path":"github.com/hashicorp/vault","checksumSHA1":"eGzvBRMFD6ZB3A6uO750np7Om/E=","revision":"182ba68a9589d4cef95234134aaa498a686e3de3","revisionTime":"2016-08-21T23:40:57Z"}, 
{"path":"github.com/hashicorp/vault/api","checksumSHA1":"mKN4rEIWyflT6aqJyjgu9m1tPXI=","revision":"3ddd3bd20cec0588788547aecd15e91461b9d546","revisionTime":"2018-04-03T21:11:47Z"}, {"path":"github.com/hashicorp/vault/helper/compressutil","checksumSHA1":"jHVLe8KMdEpb/ZALp0zu+tenADo=","revision":"3ddd3bd20cec0588788547aecd15e91461b9d546","revisionTime":"2018-04-03T21:11:47Z"},