From 33cf70a76786c59cce7b68854b14ba187520ef66 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Mon, 26 Apr 2021 13:24:06 -0600 Subject: [PATCH] connect: use exp backoff when waiting on consul envoy bootstrap This PR wraps the use of the consul envoy bootstrap command in an expoenential backoff closure, configured to timeout after 60 seconds. This is an increase over the current behavior of making 3 attempts over 6 seconds. Should help with #10451 --- .../taskrunner/envoy_bootstrap_hook.go | 129 ++++++++++------ .../taskrunner/envoy_bootstrap_hook_test.go | 99 ++++++++++++ helper/exptime/backoff.go | 142 ++++++++++++++++++ 3 files changed, 324 insertions(+), 46 deletions(-) create mode 100644 helper/exptime/backoff.go diff --git a/client/allocrunner/taskrunner/envoy_bootstrap_hook.go b/client/allocrunner/taskrunner/envoy_bootstrap_hook.go index 4489b14f6a8..d2ef3df53d1 100644 --- a/client/allocrunner/taskrunner/envoy_bootstrap_hook.go +++ b/client/allocrunner/taskrunner/envoy_bootstrap_hook.go @@ -16,6 +16,7 @@ import ( "github.com/hashicorp/nomad/client/taskenv" agentconsul "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/helper" + "github.com/hashicorp/nomad/helper/exptime" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/nomad/structs/config" "github.com/pkg/errors" @@ -23,6 +24,21 @@ import ( const envoyBootstrapHookName = "envoy_bootstrap" +const ( + // envoyBootstrapWaitTime is the amount of time this hook should wait on Consul + // objects to be created before giving up. + envoyBootstrapWaitTime = 60 * time.Second + + // envoyBootstrapInitialGap is the initial amount of time the envoy bootstrap + // retry loop will wait, exponentially increasing each iteration, not including + // jitter. + envoyBoostrapInitialGap = 1 * time.Second + + // envoyBootstrapMaxJitter is the maximum amount of jitter applied to the + // wait gap each iteration of the envoy bootstrap retry loop. + envoyBootstrapMaxJitter = 500 * time.Millisecond +) + type consulTransportConfig struct { HTTPAddr string // required Auth string // optional, env CONSUL_HTTP_AUTH @@ -100,16 +116,32 @@ type envoyBootstrapHook struct { // consulNamespace is the Consul namespace as set by in the job consulNamespace string + // envoyBootstrapWaitTime is the total amount of time hook will wait for Consul + envoyBootstrapWaitTime time.Duration + + // envoyBootstrapInitialGap is the initial wait gap when retyring + envoyBoostrapInitialGap time.Duration + + // envoyBootstrapMaxJitter is the maximum amount of jitter applied to retries + envoyBootstrapMaxJitter time.Duration + + // envoyBootstrapExpSleep controls exponential waiting + envoyBootstrapExpSleep func(time.Duration) + // logger is used to log things logger hclog.Logger } func newEnvoyBootstrapHook(c *envoyBootstrapHookConfig) *envoyBootstrapHook { return &envoyBootstrapHook{ - alloc: c.alloc, - consulConfig: c.consul, - consulNamespace: c.consulNamespace, - logger: c.logger.Named(envoyBootstrapHookName), + alloc: c.alloc, + consulConfig: c.consul, + consulNamespace: c.consulNamespace, + envoyBootstrapWaitTime: envoyBootstrapWaitTime, + envoyBoostrapInitialGap: envoyBoostrapInitialGap, + envoyBootstrapMaxJitter: envoyBootstrapMaxJitter, + envoyBootstrapExpSleep: time.Sleep, + logger: c.logger.Named(envoyBootstrapHookName), } } @@ -221,66 +253,71 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart bootstrapArgs := bootstrap.args() bootstrapEnv := bootstrap.env(os.Environ()) + // keep track of latest error returned from exec-ing consul envoy bootstrap + var cmdErr error + // Since Consul services are registered asynchronously with this task - // hook running, retry a small number of times with backoff. - for tries := 3; ; tries-- { + // hook running, retry until timeout or success. + if backoffErr := exptime.Backoff(func() (bool, error) { + + // If hook is killed, just stop. + select { + case <-ctx.Done(): + return false, nil + default: + } + // Prepare bootstrap command to run. cmd := exec.CommandContext(ctx, "consul", bootstrapArgs...) cmd.Env = bootstrapEnv - // Redirect output to secrets/envoy_bootstrap.json - fd, err := os.Create(bootstrapFilePath) - if err != nil { - return fmt.Errorf("error creating secrets/envoy_bootstrap.json for envoy: %v", err) + // Redirect stdout to secrets/envoy_bootstrap.json. + fd, fileErr := os.Create(bootstrapFilePath) + if fileErr != nil { + return false, fmt.Errorf("failed to create secrets/envoy_bootstrap.json for envoy: %w", fileErr) } cmd.Stdout = fd + // Redirect stderr into a buffer for later reading. buf := bytes.NewBuffer(nil) cmd.Stderr = buf // Generate bootstrap - err = cmd.Run() + cmdErr = cmd.Run() - // Close bootstrap.json - fd.Close() + // Close bootstrap.json regardless of any command errors. + _ = fd.Close() - if err == nil { - // Happy path! Bootstrap was created, exit. - break + // Command succeeded, exit. + if cmdErr == nil { + // Bootstrap written. Mark as done and move on. + resp.Done = true + return false, nil } - // Check for error from command - if tries == 0 { - h.logger.Error("error creating bootstrap configuration for Connect proxy sidecar", "error", err, "stderr", buf.String()) - - // Cleanup the bootstrap file. An errors here is not - // important as (a) we test to ensure the deletion - // occurs, and (b) the file will either be rewritten on - // retry or eventually garbage collected if the task - // fails. - os.Remove(bootstrapFilePath) - - // ExitErrors are recoverable since they indicate the - // command was runnable but exited with a unsuccessful - // error code. - _, recoverable := err.(*exec.ExitError) - return structs.NewRecoverableError( - fmt.Errorf("error creating bootstrap configuration for Connect proxy sidecar: %v", err), - recoverable, - ) - } - - // Sleep before retrying to give Consul services time to register - select { - case <-time.After(2 * time.Second): - case <-ctx.Done(): - // Killed before bootstrap, exit without setting Done - return nil - } + // Command failed, prepare for retry + // + // Cleanup the bootstrap file. An errors here is not + // important as (a) we test to ensure the deletion + // occurs, and (b) the file will either be rewritten on + // retry or eventually garbage collected if the task + // fails. + _ = os.Remove(bootstrapFilePath) + + return true, cmdErr + }, exptime.BackoffOptions{ + MaxSleepTime: h.envoyBootstrapWaitTime, + InitialGapSize: h.envoyBoostrapInitialGap, + MaxJitterSize: h.envoyBootstrapMaxJitter, + }); backoffErr != nil { + // Wrap the last error from Consul and set that as our status. + _, recoverable := cmdErr.(*exec.ExitError) + return structs.NewRecoverableError( + fmt.Errorf("error creating bootstrap configuration for Connect proxy sidecar: %v", cmdErr), + recoverable, + ) } - // Bootstrap written. Mark as done and move on. - resp.Done = true return nil } diff --git a/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go b/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go index 927ae7b0823..5f9d78dfd7a 100644 --- a/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go +++ b/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go @@ -12,6 +12,7 @@ import ( "os" "path/filepath" "testing" + "time" consulapi "github.com/hashicorp/consul/api" "github.com/hashicorp/nomad/client/allocdir" @@ -625,6 +626,11 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) { h := newEnvoyBootstrapHook(newEnvoyBootstrapHookConfig(alloc, &config.ConsulConfig{ Addr: testConsul.HTTPAddr, }, consulNamespace, logger)) + + // Lower the allowable wait time for testing + h.envoyBootstrapWaitTime = 1 * time.Second + h.envoyBoostrapInitialGap = 100 * time.Millisecond + req := &interfaces.TaskPrestartRequest{ Task: sidecarTask, TaskDir: allocDir.NewTaskDir(sidecarTask.Name), @@ -648,6 +654,99 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) { require.True(t, os.IsNotExist(err)) } +func TestTaskRunner_EnvoyBootstrapHook_retryTimeout(t *testing.T) { + t.Parallel() + logger := testlog.HCLogger(t) + + testConsul := getTestConsul(t) + defer testConsul.Stop() + + begin := time.Now() + + // Setup an Allocation + alloc := mock.ConnectAlloc() + alloc.AllocatedResources.Shared.Networks = []*structs.NetworkResource{ + { + Mode: "bridge", + IP: "10.0.0.1", + DynamicPorts: []structs.Port{ + { + Label: "connect-proxy-foo", + Value: 9999, + To: 9999, + }, + }, + }, + } + tg := alloc.Job.TaskGroups[0] + tg.Services = []*structs.Service{ + { + Name: "foo", + PortLabel: "9999", // Just need a valid port, nothing will bind to it + Connect: &structs.ConsulConnect{ + SidecarService: &structs.ConsulSidecarService{}, + }, + }, + } + sidecarTask := &structs.Task{ + Name: "sidecar", + Kind: structs.NewTaskKind(structs.ConnectProxyPrefix, "foo"), + } + tg.Tasks = append(tg.Tasks, sidecarTask) + allocDir, cleanupAlloc := allocdir.TestAllocDir(t, logger, "EnvoyBootstrapRetryTimeout") + defer cleanupAlloc() + + // Get a Consul client + consulConfig := consulapi.DefaultConfig() + consulConfig.Address = testConsul.HTTPAddr + + // Do NOT register group services, causing the hook to retry until timeout + + // Run Connect bootstrap hook + h := newEnvoyBootstrapHook(newEnvoyBootstrapHookConfig(alloc, &config.ConsulConfig{ + Addr: consulConfig.Address, + }, consulNamespace, logger)) + + // Keep track of the retry backoff iterations + iterations := 0 + + // Lower the allowable wait time for testing + h.envoyBootstrapWaitTime = 3 * time.Second + h.envoyBoostrapInitialGap = 1 * time.Second + h.envoyBootstrapExpSleep = func(d time.Duration) { + iterations++ + time.Sleep(d) + } + + // Create the prestart request + req := &interfaces.TaskPrestartRequest{ + Task: sidecarTask, + TaskDir: allocDir.NewTaskDir(sidecarTask.Name), + TaskEnv: taskenv.NewEmptyTaskEnv(), + } + require.NoError(t, req.TaskDir.Build(false, nil)) + + var resp interfaces.TaskPrestartResponse + + // Run the hook and get the error + err := h.Prestart(context.Background(), req, &resp) + require.EqualError(t, err, "error creating bootstrap configuration for Connect proxy sidecar: exit status 1") + + // Current time should be at least start time + total wait time + minimum := begin.Add(h.envoyBootstrapWaitTime) + require.True(t, time.Now().After(minimum)) + + // Should hit at least 2 iterations + require.Greater(t, 2, iterations) + + // Make sure we captured the recoverable-ness of the error + _, ok := err.(*structs.RecoverableError) + require.True(t, ok) + + // Assert the hook is not done (it failed) + require.False(t, resp.Done) +} + func TestTaskRunner_EnvoyBootstrapHook_extractNameAndKind(t *testing.T) { t.Run("connect sidecar", func(t *testing.T) { kind, name, err := (*envoyBootstrapHook)(nil).extractNameAndKind( diff --git a/helper/exptime/backoff.go b/helper/exptime/backoff.go new file mode 100644 index 00000000000..8213e65cb3a --- /dev/null +++ b/helper/exptime/backoff.go @@ -0,0 +1,142 @@ +// Package exptime provides a generalized exponential backoff retry implementation. +// +// This package was copied from oss.indeed.com/go/libtime/decay and modified. +package exptime + +import ( + "errors" + "fmt" + "math/rand" + "time" +) + +var ( + // ErrMaximumTimeExceeded indicates the maximum wait time has been exceeded. + ErrMaximumTimeExceeded = errors.New("maximum backoff time exceeded") +) + +// A TryFunc is what gets executed between retry wait periods during execution +// of Backoff. The keepRetrying return value is used to control whether a retry +// attempt should be made. This feature is useful in manipulating control flow +// in cases where it is known a retry will not be successful. +type TryFunc func() (keepRetrying bool, err error) + +// BackoffOptions allow for fine-tuning backoff behavior. +type BackoffOptions struct { + // MaxSleepTime represents the maximum amount of time + // the exponential backoff system will spend sleeping, + // accumulating the amount of time spent asleep between + // retries. + // + // The algorithm starts at an interval of InitialGapSize + // and increases exponentially (x2 each iteration) from there. + // With no jitter, a MaxSleepTime of 10 seconds and InitialGapSize + // of 1 millisecond would suggest a total of 15 attempts + // (since the very last retry truncates the sleep time to + // align exactly with MaxSleepTime). + MaxSleepTime time.Duration + + // InitialGapSize sets the initial amount of time the algorithm + // will sleep before the first retry (after the first attempt). + // The actual amount of sleep time will include a random amount + // of jitter, if MaxJitterSize is non-zero. + InitialGapSize time.Duration + + // MaxJitterSize limits how much randomness we may + // introduce in the duration of each retry interval. + // The purpose of introducing jitter is to mitigate the + // effect of thundering herds + MaxJitterSize time.Duration + + // RandomSeed is used for generating a randomly computed + // jitter size for each retry. + RandomSeed int64 + + // Sleeper is used to cause the process to sleep for + // a computed amount of time. If not set, a default + // implementation based on time.Sleep will be used. + Sleeper Sleeper +} + +// A Sleeper is a useful way for calling time.Sleep +// in a mock-able way for tests. +type Sleeper func(time.Duration) + +// Backoff will attempt to execute function using a configurable +// exponential backoff algorithm. function is a TryFunc which requires +// two return parameters - a boolean for optimizing control flow, and +// an error for reporting failure conditions. If the first parameter is +// false, the backoff algorithm will abandon further retry attempts and +// simply return an error. Otherwise, if the returned error is non-nil, the +// backoff algorithm will sleep for an increasing amount of time, and +// then retry again later, until the maximum amount of sleep time has +// been consumed. Once function has executed successfully with no error, +// the backoff algorithm returns a nil error. +func Backoff(function TryFunc, options BackoffOptions) error { + if options.MaxSleepTime <= 0 { + panic("max sleep time must be > 0") + } + + if options.InitialGapSize <= 0 { + panic("initial gap size must be > 0") + } + + if options.MaxJitterSize < 0 { + panic("max jitter size must be >= 0") + } + + if options.MaxJitterSize > (options.MaxSleepTime / 2) { + panic("max jitter size is way too large") + } + + if options.Sleeper == nil { + options.Sleeper = time.Sleep + } + + consumed := time.Duration(0) + gap := options.InitialGapSize + random := rand.New(rand.NewSource(options.RandomSeed)) + + for consumed < options.MaxSleepTime { + keepRetrying, err := function() + if err != nil && !keepRetrying { + return fmt.Errorf("exponential backoff instructed to stop retrying: %w", err) + } + + // we can ignore keepRetrying at this point, since we know + // what to do based on err + if err == nil { + return nil // success + } + + // there was an error, and function wants to keep retrying + // we will sleep, and then let the loop continue + // + // (random.Float64 returns a value [0.0, 1.0), which is used to + // randomly scale the jitter from 0 to MaxJitterSize. + jitter := nextJitter(random.Float64(), options.MaxJitterSize) + duration := gap + jitter + + if (duration + consumed) > options.MaxSleepTime { + // this will be our last try, force the duration + // to line up with the maximum sleep time + duration = options.MaxSleepTime - consumed + } + + // sleep for the configured duration + options.Sleeper(duration) + + // account for how long we intended to sleep + consumed += duration + + // exponentially increase the gap + gap *= 2 + } + + return ErrMaximumTimeExceeded +} + +func nextJitter(fraction float64, maxSize time.Duration) time.Duration { + scaled := fraction * float64(maxSize) + return time.Duration(scaled) +}