Skip to content

Commit

Permalink
Provide fake proivder IDs for failed machines
Browse files Browse the repository at this point in the history
  • Loading branch information
JoelSpeed authored and detiber committed Jun 2, 2020
1 parent 38df5c6 commit b898c1f
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"os"
"strings"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -46,6 +47,7 @@ const (
resourceNameMachine = "machines"
resourceNameMachineSet = "machinesets"
resourceNameMachineDeployment = "machinedeployments"
failedMachinePrefix = "failed-machine-"
)

// machineController watches for Nodes, Machines, MachineSets and
Expand Down Expand Up @@ -219,6 +221,16 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
}
}

if isFailedMachineProviderID(providerID) {
machine, err := c.findMachine(machineKeyFromFailedProviderID(providerID))
if err != nil {
return nil, err
}
if machine != nil {
return machine.DeepCopy(), nil
}
}

// If the machine object has no providerID--maybe actuator
// does not set this value (e.g., OpenStack)--then first
// lookup the node using ProviderID. If that is successful
Expand All @@ -234,6 +246,15 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
return c.findMachine(node.Annotations[machineAnnotationKey])
}

func isFailedMachineProviderID(providerID normalizedProviderID) bool {
return strings.HasPrefix(string(providerID), failedMachinePrefix)
}

func machineKeyFromFailedProviderID(providerID normalizedProviderID) string {
namespaceName := strings.TrimPrefix(string(providerID), failedMachinePrefix)
return strings.Replace(namespaceName, "_", "/", 1)
}

// findNodeByNodeName finds the Node object keyed by name.. Returns
// nil if it cannot be found. A DeepCopy() of the object is returned
// on success.
Expand Down Expand Up @@ -393,6 +414,17 @@ func (c *machineController) machineSetProviderIDs(machineSet *MachineSet) ([]str
continue
}

if machine.Status.FailureMessage != nil {
klog.V(4).Infof("Status.FailureMessage of machine %q is %q", machine.Name, *machine.Status.FailureMessage)
// Provide a fake ID to allow the autoscaler to track machines that will never
// become nodes and mark the nodegroup unhealthy after maxNodeProvisionTime.
// Fake ID needs to be recognised later and converted into a machine key.
// Use an underscore as a separator between namespace and name as it is not a
// valid character within a namespace name.
providerIDs = append(providerIDs, fmt.Sprintf("%s%s_%s", failedMachinePrefix, machine.Namespace, machine.Name))
continue
}

if machine.Status.NodeRef == nil {
klog.V(4).Infof("Status.NodeRef of machine %q is currently nil", machine.Name)
continue
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,17 @@ func TestControllerFindMachineByProviderID(t *testing.T) {
t.Fatalf("expected machines to be equal - expected %+v, got %+v", testConfig.machines[0], machine)
}

// Test #2: Verify machine is not found if it has a
// Test #2: Verify machine returned by fake provider ID is correct machine
fakeProviderID := fmt.Sprintf("%s$s/%s", testConfig.machines[0].Namespace, testConfig.machines[0].Name)
machine, err = controller.findMachineByProviderID(normalizedProviderID(fakeProviderID))
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if machine != nil {
t.Fatal("expected find to fail")
}

// Test #3: Verify machine is not found if it has a
// non-existent or different provider ID.
machine = testConfig.machines[0].DeepCopy()
machine.Spec.ProviderID = pointer.StringPtr("does-not-match")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ func newMachineFromUnstructured(u *unstructured.Unstructured) *Machine {
machine.Status.NodeRef = &nodeRef
}

if failureMessage, _, _ := unstructured.NestedString(u.Object, "status", "failureMessage"); failureMessage != "" {
machine.Status.FailureMessage = pointer.StringPtr(failureMessage)
}

return &machine
}

Expand Down Expand Up @@ -184,5 +188,9 @@ func newUnstructuredFromMachine(m *Machine) *unstructured.Unstructured {
}
}

if m.Status.FailureMessage != nil {
unstructured.SetNestedField(u.Object, *m.Status.FailureMessage, "status", "failureMessage")
}

return &u
}
19 changes: 19 additions & 0 deletions cluster-autoscaler/cloudprovider/clusterapi/machine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,25 @@ type MachineStatus struct {
// NodeRef will point to the corresponding Node if it exists.
// +optional
NodeRef *corev1.ObjectReference `json:"nodeRef,omitempty"`

// FailureMessage will be set in the event that there is a terminal problem
// reconciling the Machine and will contain a more verbose string suitable
// for logging and human consumption.
//
// This field should not be set for transitive errors that a controller
// faces that are expected to be fixed automatically over
// time (like service outages), but instead indicate that something is
// fundamentally wrong with the Machine's spec or the configuration of
// the controller, and that manual intervention is required. Examples
// of terminal errors would be invalid combinations of settings in the
// spec, values that are unsupported by the controller, or the
// responsible controller itself being critically misconfigured.
//
// Any transient errors that occur during the reconciliation of Machines
// can be added as events to the Machine object and/or logged in the
// controller's output.
// +optional
FailureMessage *string `json:"failureMessage,omitempty"`
}

// MachineList contains a list of Machine
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit b898c1f

Please sign in to comment.