diff --git a/.changelog/19787.txt b/.changelog/19787.txt new file mode 100644 index 00000000000..3f05b830a77 --- /dev/null +++ b/.changelog/19787.txt @@ -0,0 +1,3 @@ +```release-note:bug +connect: Fixed envoy sidecars being unable to restart after node reboots +``` diff --git a/client/allocrunner/interfaces/task_lifecycle.go b/client/allocrunner/interfaces/task_lifecycle.go index 21f0f8992f2..3c3245bbeea 100644 --- a/client/allocrunner/interfaces/task_lifecycle.go +++ b/client/allocrunner/interfaces/task_lifecycle.go @@ -87,6 +87,15 @@ type TaskPrestartResponse struct { // Done lets the hook indicate that it completed successfully and // should not be run again. + // + // Use sparringly! In general hooks should be idempotent and therefore Done + // is unneeded. You never know at what point an agent might crash, and it can + // be hard to reason about how Done=true impacts agent restarts and node + // reboots. See #19787 for example. + // + // Done is useful for expensive operations such as downloading artifacts, or + // for operations which might fail needlessly if rerun while a node is + // disconnected. Done bool } diff --git a/client/allocrunner/taskrunner/connect_native_hook.go b/client/allocrunner/taskrunner/connect_native_hook.go index 6acd8861eb8..8df4e1f41a1 100644 --- a/client/allocrunner/taskrunner/connect_native_hook.go +++ b/client/allocrunner/taskrunner/connect_native_hook.go @@ -119,8 +119,8 @@ func (h *connectNativeHook) Prestart( merge(environment, h.bridgeEnv(request.TaskEnv.EnvMap)) merge(environment, h.hostEnv(request.TaskEnv.EnvMap)) - // tls/acl setup for native task done - response.Done = true + // tls/acl setup for native task done but since SecretsDir is a tmpfs, don't + // mark Done=true as this hook will need to rerun on node reboots response.Env = environment return nil } diff --git a/client/allocrunner/taskrunner/connect_native_hook_test.go b/client/allocrunner/taskrunner/connect_native_hook_test.go index 313d42ca0c9..a3e7615b7b0 100644 --- a/client/allocrunner/taskrunner/connect_native_hook_test.go +++ b/client/allocrunner/taskrunner/connect_native_hook_test.go @@ -287,9 +287,6 @@ func TestTaskRunner_ConnectNativeHook_Noop(t *testing.T) { // Run the hook require.NoError(t, h.Prestart(context.Background(), request, response)) - // Assert the hook is Done - require.True(t, response.Done) - // Assert no environment variables configured to be set require.Empty(t, response.Env) @@ -352,9 +349,6 @@ func TestTaskRunner_ConnectNativeHook_Ok(t *testing.T) { // Run the Connect Native hook require.NoError(t, h.Prestart(context.Background(), request, response)) - // Assert the hook is Done - require.True(t, response.Done) - // Assert only CONSUL_HTTP_ADDR env variable is set require.Equal(t, map[string]string{"CONSUL_HTTP_ADDR": testConsul.HTTPAddr}, response.Env) @@ -424,9 +418,6 @@ func TestTaskRunner_ConnectNativeHook_with_SI_token(t *testing.T) { // Run the Connect Native hook require.NoError(t, h.Prestart(context.Background(), request, response)) - // Assert the hook is Done - require.True(t, response.Done) - // Assert environment variable for token is set require.NotEmpty(t, response.Env) require.Equal(t, token, response.Env["CONSUL_HTTP_TOKEN"]) @@ -504,9 +495,6 @@ func TestTaskRunner_ConnectNativeHook_shareTLS(t *testing.T) { // Run the Connect Native hook require.NoError(t, h.Prestart(context.Background(), request, response)) - // Assert the hook is Done - require.True(t, response.Done) - // Remove variables we are not interested in delete(response.Env, "CONSUL_HTTP_ADDR") @@ -634,9 +622,6 @@ func TestTaskRunner_ConnectNativeHook_shareTLS_override(t *testing.T) { // Run the Connect Native hook require.NoError(t, h.Prestart(context.Background(), request, response)) - // Assert the hook is Done - require.True(t, response.Done) - // Assert environment variable for CONSUL_HTTP_SSL is set, because it was // the only one not overridden by task env block config require.NotEmpty(t, response.Env) diff --git a/client/allocrunner/taskrunner/envoy_bootstrap_hook.go b/client/allocrunner/taskrunner/envoy_bootstrap_hook.go index 10fc146016d..483157af203 100644 --- a/client/allocrunner/taskrunner/envoy_bootstrap_hook.go +++ b/client/allocrunner/taskrunner/envoy_bootstrap_hook.go @@ -235,8 +235,6 @@ func (h *envoyBootstrapHook) lookupService(svcKind, svcName string, taskEnv *tas // Must be aware of both launching envoy as a sidecar proxy, as well as a connect gateway. func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestartRequest, resp *ifs.TaskPrestartResponse) error { if !req.Task.Kind.IsConnectProxy() && !req.Task.Kind.IsAnyConnectGateway() { - // Not a Connect proxy sidecar - resp.Done = true return nil } @@ -358,8 +356,8 @@ func (h *envoyBootstrapHook) Prestart(ctx context.Context, req *ifs.TaskPrestart // Command succeeded, exit. if cmdErr == nil { - // Bootstrap written. Mark as done and move on. - resp.Done = true + // Bootstrap written. Move on without marking as Done as Prestart needs + // to rerun after node reboots. return false, nil } diff --git a/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go b/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go index b268f921bb1..f6caa50821e 100644 --- a/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go +++ b/client/allocrunner/taskrunner/envoy_bootstrap_hook_test.go @@ -368,9 +368,6 @@ func TestEnvoyBootstrapHook_with_SI_token(t *testing.T) { // Run the hook require.NoError(t, h.Prestart(context.Background(), req, resp)) - // Assert it is Done - require.True(t, resp.Done) - // Ensure the default path matches env := map[string]string{ taskenv.SecretsDir: req.TaskDir.SecretsDir, @@ -463,9 +460,6 @@ func TestTaskRunner_EnvoyBootstrapHook_sidecar_ok(t *testing.T) { // Run the hook require.NoError(t, h.Prestart(context.Background(), req, resp)) - // Assert it is Done - require.True(t, resp.Done) - require.NotNil(t, resp.Env) require.Equal(t, "127.0.0.2:19001", resp.Env[envoyAdminBindEnvPrefix+"foo"]) @@ -547,7 +541,6 @@ func TestTaskRunner_EnvoyBootstrapHook_gateway_ok(t *testing.T) { require.NoError(t, h.Prestart(context.Background(), req, &resp)) // Assert the hook is Done - require.True(t, resp.Done) require.NotNil(t, resp.Env) // Read the Envoy Config file @@ -596,9 +589,6 @@ func TestTaskRunner_EnvoyBootstrapHook_Noop(t *testing.T) { // Run the hook require.NoError(t, h.Prestart(context.Background(), req, resp)) - // Assert it is Done - require.True(t, resp.Done) - // Assert no file was written _, err := os.Open(filepath.Join(req.TaskDir.SecretsDir, "envoy_bootstrap.json")) require.Error(t, err) @@ -677,9 +667,6 @@ func TestTaskRunner_EnvoyBootstrapHook_RecoverableError(t *testing.T) { require.ErrorIs(t, err, errEnvoyBootstrapError) require.True(t, structs.IsRecoverable(err)) - // Assert it is not Done - require.False(t, resp.Done) - // Assert no file was written _, err = os.Open(filepath.Join(req.TaskDir.SecretsDir, "envoy_bootstrap.json")) require.Error(t, err)