From b2766af9c3c538cf7698e926a9c5ca10c5e13dc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20T=C3=B6lle?= Date: Mon, 11 Dec 2023 09:38:35 +0100 Subject: [PATCH] fix(hetzner): insufficient nodes when boot fails The Hetzner Cloud API returns "Actions" for anything asynchronous that happens inside the backend. When creating a new server, multiple actions are returned: `create_server`, `start_server`, `attach_to_network` (if set). Our current code waits for the `create_server` action and, if it fails, makes sure to delete the server so cluster-autoscaler can create a new one immediately to provide the required capacity. If one of the "follow up" actions fails though, we do not handle this. This causes issues when the server for whatever reason did not start properly on the first try, as then the customer has a shut-down server, is paying for it, but does not receive the additional capacity for their Kubernetes cluster. This commit fixes the bug by awaiting all actions returned by the create server API call, and deleting the server if any of them fails. 
--- .../cloudprovider/hetzner/hetzner_node_group.go | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go index e644f87c9d36..4440ca2a226d 100644 --- a/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go +++ b/cluster-autoscaler/cloudprovider/hetzner/hetzner_node_group.go @@ -385,12 +385,18 @@ func createServer(n *hetznerNodeGroup) error { return fmt.Errorf("could not create server type %s in region %s: %v", n.instanceType, n.region, err) } - action := serverCreateResult.Action server := serverCreateResult.Server - err = waitForServerAction(n.manager, server.Name, action) - if err != nil { - _ = n.manager.deleteServer(server) - return fmt.Errorf("failed to start server %s error: %v", server.Name, err) + + actions := []*hcloud.Action{serverCreateResult.Action} + actions = append(actions, serverCreateResult.NextActions...) + + // Delete the server if any action (most importantly create_server & start_server) fails + for _, action := range actions { + err = waitForServerAction(n.manager, server.Name, action) + if err != nil { + _ = n.manager.deleteServer(server) + return fmt.Errorf("failed to start server %s error: %v", server.Name, err) + } } return nil