From 040b41b4fee6b5bad7ccbeebdaa4cb9e8b790083 Mon Sep 17 00:00:00 2001 From: Michael Yuen Date: Sun, 21 Jun 2020 13:17:49 +0200 Subject: [PATCH 1/2] Fix issue where consul esm can generate spurious updates ESM can generate many repeated updates while it waits for local agents or server replicas to catch up. In the case of a service that's getting a lot of updates in a short period of time, this can delay updating any health checks for a while (we've observed metrics showing this to be on the order of 5s for some consul cluster configurations). This change makes sure to do a read from the leader before making a CAS update to the check value. --- check.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/check.go b/check.go index ea36b41..503b2da 100644 --- a/check.go +++ b/check.go @@ -308,7 +308,7 @@ func (c *CheckRunner) UpdateCheck(checkID types.CheckID, status, output string) // Should only be called when the lock is held. func (c *CheckRunner) handleCheckUpdate(check *api.HealthCheck, status, output string) { // Exit early if the check or node have been deregistered. 
- checks, _, err := c.client.Health().Node(check.Node, nil) + checks, _, err := c.client.Health().Node(check.Node, &api.QueryOptions{RequireConsistent: true}) if err != nil { c.logger.Printf("[WARN] error retrieving existing node entry: %v", err) return @@ -348,7 +348,7 @@ func (c *CheckRunner) handleCheckUpdate(check *api.HealthCheck, status, output s for _, e := range resp.Errors { errs = multierror.Append(errs, errors.New(e.What)) } - c.logger.Printf("[WARN] Error updating check status in Consul: %v", errs) + c.logger.Printf("[WARN] Error(s) returned from txn when updating check status in Consul: %v", errs) return } if !ok { From 867c2c329bad037029e039b5784209a588f5063e Mon Sep 17 00:00:00 2001 From: Michael Yuen Date: Tue, 23 Jun 2020 19:40:06 +0100 Subject: [PATCH 2/2] Update check.go Co-authored-by: lornasong --- check.go | 1 + 1 file changed, 1 insertion(+) diff --git a/check.go b/check.go index 503b2da..a2ddeb1 100644 --- a/check.go +++ b/check.go @@ -308,6 +308,7 @@ func (c *CheckRunner) UpdateCheck(checkID types.CheckID, status, output string) // Should only be called when the lock is held. func (c *CheckRunner) handleCheckUpdate(check *api.HealthCheck, status, output string) { // Exit early if the check or node have been deregistered. + // consistent mode reduces convergence time, particularly when services have many updates in a short time checks, _, err := c.client.Health().Node(check.Node, &api.QueryOptions{RequireConsistent: true}) if err != nil { c.logger.Printf("[WARN] error retrieving existing node entry: %v", err)