Skip to content

Commit

Permalink
connect: use exp backoff when waiting on consul envoy bootstrap
Browse files Browse the repository at this point in the history
This PR wraps the use of the consul envoy bootstrap command in
an expoenential backoff closure, configured to timeout after 60
seconds. This is an increase over the current behavior of making
3 attempts over 6 seconds.

Should help with #10451
  • Loading branch information
shoenig committed Apr 27, 2021
1 parent accf39a commit 6884455
Show file tree
Hide file tree
Showing 5 changed files with 336 additions and 46 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ IMPROVEMENTS:
* client/config: Enable sockaddr templating for `network-interface` attribute. [[GH-10404](https://github.com/hashicorp/nomad/issues/10404)]
* client/fingerprint: Added support multiple host network aliases for the same interface. [[GH-10104](https://github.com/hashicorp/nomad/issues/10104)]
* consul: Allow setting `body` field on service/check Consul health checks. [[GH-10186](https://github.com/hashicorp/nomad/issues/10186)]
* consul/connect: Use exponential backoff for consul envoy bootstrap process [[GH-10453](https://github.com/hashicorp/nomad/pull/10453)]
* consul/connect: Enable setting `local_bind_address` field on connect upstreams [[GH-6248](https://github.com/hashicorp/nomad/issues/6248)]
* consul/connect: Automatically populate `CONSUL_HTTP_ADDR` for connect native tasks in host networking mode. [[GH-10239](https://github.com/hashicorp/nomad/issues/10239)]
* csi: Added support for jobs to request a unique volume ID per allocation. [[GH-10136](https://github.com/hashicorp/nomad/issues/10136)]
Expand Down
129 changes: 83 additions & 46 deletions client/allocrunner/taskrunner/envoy_bootstrap_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,29 @@ import (
"github.com/hashicorp/nomad/client/taskenv"
agentconsul "github.com/hashicorp/nomad/command/agent/consul"
"github.com/hashicorp/nomad/helper"
"github.com/hashicorp/nomad/helper/exptime"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/nomad/structs/config"
"github.com/pkg/errors"
)

const envoyBootstrapHookName = "envoy_bootstrap"

const (
// envoyBootstrapWaitTime is the amount of time this hook should wait on Consul
// objects to be created before giving up.
envoyBootstrapWaitTime = 60 * time.Second

// envoyBootstrapInitialGap is the initial amount of time the envoy bootstrap
// retry loop will wait, exponentially increasing each iteration, not including
// jitter.
envoyBoostrapInitialGap = 1 * time.Second

// envoyBootstrapMaxJitter is the maximum amount of jitter applied to the
// wait gap each iteration of the envoy bootstrap retry loop.
envoyBootstrapMaxJitter = 500 * time.Millisecond
)

type consulTransportConfig struct {
HTTPAddr string // required
Auth string // optional, env CONSUL_HTTP_AUTH
Expand Down Expand Up @@ -100,16 +116,32 @@ type envoyBootstrapHook struct {
// consulNamespace is the Consul namespace as set by in the job
consulNamespace string

// envoyBootstrapWaitTime is the total amount of time hook will wait for Consul
envoyBootstrapWaitTime time.Duration

// envoyBootstrapInitialGap is the initial wait gap when retyring
envoyBoostrapInitialGap time.Duration

// envoyBootstrapMaxJitter is the maximum amount of jitter applied to retries
envoyBootstrapMaxJitter time.Duration

// envoyBootstrapExpSleep controls exponential waiting
envoyBootstrapExpSleep func(time.Duration)

// logger is used to log things
logger hclog.Logger
}

func newEnvoyBootstrapHook(c *envoyBootstrapHookConfig) *envoyBootstrapHook {
return &envoyBootstrapHook{
alloc: c.alloc,
consulConfig: c.consul,
consulNamespace: c.consulNamespace,
logger: c.logger.Named(envoyBootstrapHookName),
alloc: c.alloc,
consulConfig: c.consul,
consulNamespace: c.consulNamespace,
envoyBootstrapWaitTime: envoyBootstrapWaitTime,
envoyBoostrapInitialGap: envoyBoostrapInitialGap,
envoyBootstrapMaxJitter: envoyBootstrapMaxJitter,
envoyBootstrapExpSleep: time.Sleep,
logger: c.logger.Named(envoyBootstrapHookName),
}
}

Expand Down Expand Up @@ -221,66 +253,71 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart
bootstrapArgs := bootstrap.args()
bootstrapEnv := bootstrap.env(os.Environ())

// keep track of latest error returned from exec-ing consul envoy bootstrap
var cmdErr error

// Since Consul services are registered asynchronously with this task
// hook running, retry a small number of times with backoff.
for tries := 3; ; tries-- {
// hook running, retry until timeout or success.
if backoffErr := exptime.Backoff(func() (bool, error) {

// If hook is killed, just stop.
select {
case <-ctx.Done():
return false, nil
default:
}

// Prepare bootstrap command to run.
cmd := exec.CommandContext(ctx, "consul", bootstrapArgs...)
cmd.Env = bootstrapEnv

// Redirect output to secrets/envoy_bootstrap.json
fd, err := os.Create(bootstrapFilePath)
if err != nil {
return fmt.Errorf("error creating secrets/envoy_bootstrap.json for envoy: %v", err)
// Redirect stdout to secrets/envoy_bootstrap.json.
fd, fileErr := os.Create(bootstrapFilePath)
if fileErr != nil {
return false, fmt.Errorf("failed to create secrets/envoy_bootstrap.json for envoy: %w", fileErr)
}
cmd.Stdout = fd

// Redirect stderr into a buffer for later reading.
buf := bytes.NewBuffer(nil)
cmd.Stderr = buf

// Generate bootstrap
err = cmd.Run()
cmdErr = cmd.Run()

// Close bootstrap.json
fd.Close()
// Close bootstrap.json regardless of any command errors.
_ = fd.Close()

if err == nil {
// Happy path! Bootstrap was created, exit.
break
// Command succeeded, exit.
if cmdErr == nil {
// Bootstrap written. Mark as done and move on.
resp.Done = true
return false, nil
}

// Check for error from command
if tries == 0 {
h.logger.Error("error creating bootstrap configuration for Connect proxy sidecar", "error", err, "stderr", buf.String())

// Cleanup the bootstrap file. An errors here is not
// important as (a) we test to ensure the deletion
// occurs, and (b) the file will either be rewritten on
// retry or eventually garbage collected if the task
// fails.
os.Remove(bootstrapFilePath)

// ExitErrors are recoverable since they indicate the
// command was runnable but exited with a unsuccessful
// error code.
_, recoverable := err.(*exec.ExitError)
return structs.NewRecoverableError(
fmt.Errorf("error creating bootstrap configuration for Connect proxy sidecar: %v", err),
recoverable,
)
}

// Sleep before retrying to give Consul services time to register
select {
case <-time.After(2 * time.Second):
case <-ctx.Done():
// Killed before bootstrap, exit without setting Done
return nil
}
// Command failed, prepare for retry
//
// Cleanup the bootstrap file. An errors here is not
// important as (a) we test to ensure the deletion
// occurs, and (b) the file will either be rewritten on
// retry or eventually garbage collected if the task
// fails.
_ = os.Remove(bootstrapFilePath)

return true, cmdErr
}, exptime.BackoffOptions{
MaxSleepTime: h.envoyBootstrapWaitTime,
InitialGapSize: h.envoyBoostrapInitialGap,
MaxJitterSize: h.envoyBootstrapMaxJitter,
}); backoffErr != nil {
// Wrap the last error from Consul and set that as our status.
_, recoverable := cmdErr.(*exec.ExitError)
return structs.NewRecoverableError(
fmt.Errorf("error creating bootstrap configuration for Connect proxy sidecar: %v", cmdErr),
recoverable,
)
}

// Bootstrap written. Mark as done and move on.
resp.Done = true
return nil
}

Expand Down
99 changes: 99 additions & 0 deletions client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"os"
"path/filepath"
"testing"
"time"

consulapi "github.com/hashicorp/consul/api"
"github.com/hashicorp/nomad/client/allocdir"
Expand Down Expand Up @@ -625,6 +626,11 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) {
h := newEnvoyBootstrapHook(newEnvoyBootstrapHookConfig(alloc, &config.ConsulConfig{
Addr: testConsul.HTTPAddr,
}, consulNamespace, logger))

// Lower the allowable wait time for testing
h.envoyBootstrapWaitTime = 1 * time.Second
h.envoyBoostrapInitialGap = 100 * time.Millisecond

req := &interfaces.TaskPrestartRequest{
Task: sidecarTask,
TaskDir: allocDir.NewTaskDir(sidecarTask.Name),
Expand All @@ -648,6 +654,99 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) {
require.True(t, os.IsNotExist(err))
}

func TestTaskRunner_EnvoyBootstrapHook_retryTimeout(t *testing.T) {
t.Parallel()
logger := testlog.HCLogger(t)

testConsul := getTestConsul(t)
defer testConsul.Stop()

begin := time.Now()

// Setup an Allocation
alloc := mock.ConnectAlloc()
alloc.AllocatedResources.Shared.Networks = []*structs.NetworkResource{
{
Mode: "bridge",
IP: "10.0.0.1",
DynamicPorts: []structs.Port{
{
Label: "connect-proxy-foo",
Value: 9999,
To: 9999,
},
},
},
}
tg := alloc.Job.TaskGroups[0]
tg.Services = []*structs.Service{
{
Name: "foo",
PortLabel: "9999", // Just need a valid port, nothing will bind to it
Connect: &structs.ConsulConnect{
SidecarService: &structs.ConsulSidecarService{},
},
},
}
sidecarTask := &structs.Task{
Name: "sidecar",
Kind: structs.NewTaskKind(structs.ConnectProxyPrefix, "foo"),
}
tg.Tasks = append(tg.Tasks, sidecarTask)
allocDir, cleanupAlloc := allocdir.TestAllocDir(t, logger, "EnvoyBootstrapRetryTimeout")
defer cleanupAlloc()

// Get a Consul client
consulConfig := consulapi.DefaultConfig()
consulConfig.Address = testConsul.HTTPAddr

// Do NOT register group services, causing the hook to retry until timeout

// Run Connect bootstrap hook
h := newEnvoyBootstrapHook(newEnvoyBootstrapHookConfig(alloc, &config.ConsulConfig{
Addr: consulConfig.Address,
}, consulNamespace, logger))

// Keep track of the retry backoff iterations
iterations := 0

// Lower the allowable wait time for testing
h.envoyBootstrapWaitTime = 3 * time.Second
h.envoyBoostrapInitialGap = 1 * time.Second
h.envoyBootstrapExpSleep = func(d time.Duration) {
iterations++
time.Sleep(d)
}

// Create the prestart request
req := &interfaces.TaskPrestartRequest{
Task: sidecarTask,
TaskDir: allocDir.NewTaskDir(sidecarTask.Name),
TaskEnv: taskenv.NewEmptyTaskEnv(),
}
require.NoError(t, req.TaskDir.Build(false, nil))

var resp interfaces.TaskPrestartResponse

// Run the hook and get the error
err := h.Prestart(context.Background(), req, &resp)
require.EqualError(t, err, "error creating bootstrap configuration for Connect proxy sidecar: exit status 1")

// Current time should be at least start time + total wait time
minimum := begin.Add(h.envoyBootstrapWaitTime)
require.True(t, time.Now().After(minimum))

// Should hit at least 2 iterations
require.Greater(t, 2, iterations)

// Make sure we captured the recoverable-ness of the error
_, ok := err.(*structs.RecoverableError)
require.True(t, ok)

// Assert the hook is not done (it failed)
require.False(t, resp.Done)
}

func TestTaskRunner_EnvoyBootstrapHook_extractNameAndKind(t *testing.T) {
t.Run("connect sidecar", func(t *testing.T) {
kind, name, err := (*envoyBootstrapHook)(nil).extractNameAndKind(
Expand Down
11 changes: 11 additions & 0 deletions helper/exptime/LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Copyright (c) 2019 The Indeed Engineering Authors. All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Loading

0 comments on commit 6884455

Please sign in to comment.