From 0fcb0d4016dbe49250a5ceca19a3a2df2f28125b Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 1 Nov 2019 07:15:11 -0700 Subject: [PATCH] client: fix panic from 0.8 -> 0.10 upgrade makeAllocTaskServices did not do a nil check on AllocatedResources which causes a panic when upgrading directly from 0.8 to 0.10. While skipping 0.9 is not supported we intend to fix serious crashers caused by such upgrades to prevent cluster outages. I did a quick audit of the client package and everywhere else that accesses AllocatedResources appears to be properly guarded by a nil check. --- client/allocrunner/taskrunner/service_hook.go | 6 +- client/allocrunner/taskrunner/task_runner.go | 2 +- command/agent/consul/client.go | 9 ++- command/agent/consul/group_test.go | 81 +++++++++++++++++++ 4 files changed, 94 insertions(+), 4 deletions(-) diff --git a/client/allocrunner/taskrunner/service_hook.go b/client/allocrunner/taskrunner/service_hook.go index edea8673cbf..94bbcb23b96 100644 --- a/client/allocrunner/taskrunner/service_hook.go +++ b/client/allocrunner/taskrunner/service_hook.go @@ -63,7 +63,8 @@ func newServiceHook(c serviceHookConfig) *serviceHook { delay: c.task.ShutdownDelay, } - // COMPAT(0.10): Just use the AllocatedResources + // COMPAT(0.11): AllocatedResources was added in 0.9 so assume its set + // in 0.11. if c.alloc.AllocatedResources != nil { if res := c.alloc.AllocatedResources.Tasks[c.task.Name]; res != nil { h.networks = res.Networks @@ -115,7 +116,8 @@ func (h *serviceHook) Update(ctx context.Context, req *interfaces.TaskUpdateRequ canary = req.Alloc.DeploymentStatus.Canary } - // COMPAT(0.10): Just use the AllocatedResources + // COMPAT(0.11): AllocatedResources was added in 0.9 so assume its set + // in 0.11. var networks structs.Networks if req.Alloc.AllocatedResources != nil { if res := req.Alloc.AllocatedResources.Tasks[h.taskName]; res != nil { diff --git a/client/allocrunner/taskrunner/task_runner.go b/client/allocrunner/taskrunner/task_runner.go index 454c682b609..314d79c2faa 100644 --- a/client/allocrunner/taskrunner/task_runner.go +++ b/client/allocrunner/taskrunner/task_runner.go @@ -301,7 +301,7 @@ func NewTaskRunner(config *Config) (*TaskRunner, error) { } tr.taskResources = tres } else { - // COMPAT(0.10): Upgrade from old resources to new resources + // COMPAT(0.11): Upgrade from 0.8 resources to 0.9+ resources // Grab the old task resources oldTr, ok := tr.alloc.TaskResources[tr.taskName] if !ok { diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 9224fba1f68..cfa219d8203 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -805,7 +805,9 @@ func (noopRestarter) Restart(context.Context, *structs.TaskEvent, bool) error { // //TODO(schmichael) rename TaskServices and refactor this into a New method func makeAllocTaskServices(alloc *structs.Allocation, tg *structs.TaskGroup) (*TaskServices, error) { - if n := len(alloc.AllocatedResources.Shared.Networks); n == 0 { + //COMPAT(0.11) AllocatedResources is only nil when upgrading directly + // from 0.8. + if alloc.AllocatedResources == nil || len(alloc.AllocatedResources.Shared.Networks) == 0 { return nil, fmt.Errorf("unable to register a group service without a group network") } @@ -867,6 +869,11 @@ func (c *ServiceClient) UpdateGroup(oldAlloc, newAlloc *structs.Allocation) erro return fmt.Errorf("task group %q not in old allocation", oldAlloc.TaskGroup) } + if len(oldTG.Services) == 0 { + // No old group services, simply add new group services + return c.RegisterGroup(newAlloc) + } + oldServices, err := makeAllocTaskServices(oldAlloc, oldTG) if err != nil { return err diff --git a/command/agent/consul/group_test.go b/command/agent/consul/group_test.go index 40f5c84fa5e..6f9fe0e215d 100644 --- a/command/agent/consul/group_test.go +++ b/command/agent/consul/group_test.go @@ -127,3 +127,84 @@ func TestConsul_Connect(t *testing.T) { time.Sleep(interval >> 2) } } + +// TestConsul_Update08Alloc asserts that adding group services to a previously +// 0.8 alloc works. +// +// COMPAT(0.11) Only valid for upgrades from 0.8. +func TestConsul_Update08Alloc(t *testing.T) { + // Create an embedded Consul server + testconsul, err := testutil.NewTestServerConfig(func(c *testutil.TestServerConfig) { + // If -v wasn't specified squelch consul logging + if !testing.Verbose() { + c.Stdout = ioutil.Discard + c.Stderr = ioutil.Discard + } + }) + if err != nil { + t.Fatalf("error starting test consul server: %v", err) + } + defer testconsul.Stop() + + consulConfig := consulapi.DefaultConfig() + consulConfig.Address = testconsul.HTTPAddr + consulClient, err := consulapi.NewClient(consulConfig) + require.NoError(t, err) + serviceClient := NewServiceClient(consulClient.Agent(), testlog.HCLogger(t), true) + + // Lower periodicInterval to ensure periodic syncing doesn't improperly + // remove Connect services. + const interval = 50 * time.Millisecond + serviceClient.periodicInterval = interval + + // Disable deregistration probation to test syncing + serviceClient.deregisterProbationExpiry = time.Time{} + + go serviceClient.Run() + defer serviceClient.Shutdown() + + // Create new 0.10-style alloc + alloc := mock.Alloc() + alloc.AllocatedResources.Shared.Networks = []*structs.NetworkResource{ + { + Mode: "bridge", + IP: "10.0.0.1", + DynamicPorts: []structs.Port{ + { + Label: "connect-proxy-testconnect", + Value: 9999, + To: 9998, + }, + }, + }, + } + tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup) + tg.Services = []*structs.Service{ + { + Name: "testconnect", + PortLabel: "9999", + Connect: &structs.ConsulConnect{ + SidecarService: &structs.ConsulSidecarService{ + Proxy: &structs.ConsulProxy{ + LocalServicePort: 9000, + }, + }, + }, + }, + } + + // Create old 0.8-style alloc from new alloc + oldAlloc := alloc.Copy() + oldAlloc.AllocatedResources = nil + oldAlloc.Job.LookupTaskGroup(alloc.TaskGroup).Services = nil + + // Expect new services to get registered + require.NoError(t, serviceClient.UpdateGroup(oldAlloc, alloc)) + + // Assert the group and sidecar services are registered + require.Eventually(t, func() bool { + services, err := consulClient.Agent().Services() + require.NoError(t, err) + return len(services) == 2 + }, 3*time.Second, 100*time.Millisecond) +}