Skip to content

Commit

Permalink
Merge pull request #3172 from detiber/backport-2983-v1.18
Browse files Browse the repository at this point in the history
[CA-1.18] #2983 cherry-pick: ClusterAPI Provider: Provide fake proivder IDs for failed Machines
  • Loading branch information
k8s-ci-robot authored Jun 3, 2020
2 parents 6d7a013 + 1a6bb0e commit c6d8539
Show file tree
Hide file tree
Showing 6 changed files with 209 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"os"
"strings"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -46,6 +47,7 @@ const (
resourceNameMachine = "machines"
resourceNameMachineSet = "machinesets"
resourceNameMachineDeployment = "machinedeployments"
failedMachinePrefix = "failed-machine-"
)

// machineController watches for Nodes, Machines, MachineSets and
Expand Down Expand Up @@ -219,6 +221,16 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
}
}

if isFailedMachineProviderID(providerID) {
machine, err := c.findMachine(machineKeyFromFailedProviderID(providerID))
if err != nil {
return nil, err
}
if machine != nil {
return machine.DeepCopy(), nil
}
}

// If the machine object has no providerID--maybe actuator
// does not set this value (e.g., OpenStack)--then first
// lookup the node using ProviderID. If that is successful
Expand All @@ -234,6 +246,15 @@ func (c *machineController) findMachineByProviderID(providerID normalizedProvide
return c.findMachine(node.Annotations[machineAnnotationKey])
}

func isFailedMachineProviderID(providerID normalizedProviderID) bool {
return strings.HasPrefix(string(providerID), failedMachinePrefix)
}

func machineKeyFromFailedProviderID(providerID normalizedProviderID) string {
namespaceName := strings.TrimPrefix(string(providerID), failedMachinePrefix)
return strings.Replace(namespaceName, "_", "/", 1)
}

// findNodeByNodeName finds the Node object keyed by name.. Returns
// nil if it cannot be found. A DeepCopy() of the object is returned
// on success.
Expand Down Expand Up @@ -393,6 +414,17 @@ func (c *machineController) machineSetProviderIDs(machineSet *MachineSet) ([]str
continue
}

if machine.Status.FailureMessage != nil {
klog.V(4).Infof("Status.FailureMessage of machine %q is %q", machine.Name, *machine.Status.FailureMessage)
// Provide a fake ID to allow the autoscaler to track machines that will never
// become nodes and mark the nodegroup unhealthy after maxNodeProvisionTime.
// Fake ID needs to be recognised later and converted into a machine key.
// Use an underscore as a separator between namespace and name as it is not a
// valid character within a namespace name.
providerIDs = append(providerIDs, fmt.Sprintf("%s%s_%s", failedMachinePrefix, machine.Namespace, machine.Name))
continue
}

if machine.Status.NodeRef == nil {
klog.V(4).Infof("Status.NodeRef of machine %q is currently nil", machine.Name)
continue
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,17 @@ func TestControllerFindMachineByProviderID(t *testing.T) {
t.Fatalf("expected machines to be equal - expected %+v, got %+v", testConfig.machines[0], machine)
}

// Test #2: Verify machine is not found if it has a
// Test #2: Verify machine returned by fake provider ID is correct machine
fakeProviderID := fmt.Sprintf("%s$s/%s", testConfig.machines[0].Namespace, testConfig.machines[0].Name)
machine, err = controller.findMachineByProviderID(normalizedProviderID(fakeProviderID))
if err != nil {
t.Fatalf("unexpected error: %v", err)
}
if machine != nil {
t.Fatal("expected find to fail")
}

// Test #3: Verify machine is not found if it has a
// non-existent or different provider ID.
machine = testConfig.machines[0].DeepCopy()
machine.Spec.ProviderID = pointer.StringPtr("does-not-match")
Expand Down Expand Up @@ -1126,3 +1136,57 @@ func TestGetAPIGroupPreferredVersion(t *testing.T) {
})
}
}

func TestIsFailedMachineProviderID(t *testing.T) {
testCases := []struct {
name string
providerID normalizedProviderID
expected bool
}{
{
name: "with the failed machine prefix",
providerID: normalizedProviderID(fmt.Sprintf("%sfoo", failedMachinePrefix)),
expected: true,
},
{
name: "without the failed machine prefix",
providerID: normalizedProviderID("foo"),
expected: false,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
if got := isFailedMachineProviderID(tc.providerID); got != tc.expected {
t.Errorf("test case: %s, expected: %v, got: %v", tc.name, tc.expected, got)
}
})
}
}

func TestMachineKeyFromFailedProviderID(t *testing.T) {
testCases := []struct {
name string
providerID normalizedProviderID
expected string
}{
{
name: "with a valid failed machine prefix",
providerID: normalizedProviderID(fmt.Sprintf("%stest-namespace_foo", failedMachinePrefix)),
expected: "test-namespace/foo",
},
{
name: "with a machine with an underscore in the name",
providerID: normalizedProviderID(fmt.Sprintf("%stest-namespace_foo_bar", failedMachinePrefix)),
expected: "test-namespace/foo_bar",
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
if got := machineKeyFromFailedProviderID(tc.providerID); got != tc.expected {
t.Errorf("test case: %s, expected: %q, got: %q", tc.name, tc.expected, got)
}
})
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,10 @@ func newMachineFromUnstructured(u *unstructured.Unstructured) *Machine {
machine.Status.NodeRef = &nodeRef
}

if failureMessage, _, _ := unstructured.NestedString(u.Object, "status", "failureMessage"); failureMessage != "" {
machine.Status.FailureMessage = pointer.StringPtr(failureMessage)
}

return &machine
}

Expand Down Expand Up @@ -184,5 +188,9 @@ func newUnstructuredFromMachine(m *Machine) *unstructured.Unstructured {
}
}

if m.Status.FailureMessage != nil {
unstructured.SetNestedField(u.Object, *m.Status.FailureMessage, "status", "failureMessage")
}

return &u
}
Original file line number Diff line number Diff line change
Expand Up @@ -943,3 +943,83 @@ func TestNodeGroupDeleteNodesTwice(t *testing.T) {
}))
})
}

func TestNodeGroupWithFailedMachine(t *testing.T) {
test := func(t *testing.T, testConfig *testConfig) {
controller, stop := mustCreateTestController(t, testConfig)
defer stop()

// Simulate a failed machine
machine := testConfig.machines[3].DeepCopy()
machine.Spec.ProviderID = nil
failureMessage := "FailureMessage"
machine.Status.FailureMessage = &failureMessage
if err := controller.machineInformer.Informer().GetStore().Update(newUnstructuredFromMachine(machine)); err != nil {
t.Fatalf("unexpected error updating machine, got %v", err)
}

nodegroups, err := controller.nodeGroups()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}

if l := len(nodegroups); l != 1 {
t.Fatalf("expected 1 nodegroup, got %d", l)
}

ng := nodegroups[0]
nodeNames, err := ng.Nodes()
if err != nil {
t.Fatalf("unexpected error: %v", err)
}

if len(nodeNames) != len(testConfig.nodes) {
t.Fatalf("expected len=%v, got len=%v", len(testConfig.nodes), len(nodeNames))
}

sort.SliceStable(nodeNames, func(i, j int) bool {
return nodeNames[i].Id < nodeNames[j].Id
})

// The failed machine key is sorted to the first index
failedMachineID := fmt.Sprintf("%s%s_%s", failedMachinePrefix, machine.Namespace, machine.Name)
if nodeNames[0].Id != failedMachineID {
t.Fatalf("expected %q, got %q", failedMachineID, nodeNames[0].Id)
}

for i := 1; i < len(nodeNames); i++ {
// Fix the indexing due the failed machine being removed from the list
var nodeIndex int
if i < 4 {
// for nodes 0, 1, 2
nodeIndex = i - 1
} else {
// for nodes 4 onwards
nodeIndex = i
}

if nodeNames[i].Id != string(normalizedProviderString(testConfig.nodes[nodeIndex].Spec.ProviderID)) {
t.Fatalf("expected %q, got %q", testConfig.nodes[nodeIndex].Spec.ProviderID, nodeNames[i].Id)
}
}
}

// Note: 10 is an upper bound for the number of nodes/replicas
// Going beyond 10 will break the sorting that happens in the
// test() function because sort.Strings() will not do natural
// sorting and the expected semantics in test() will fail.

t.Run("MachineSet", func(t *testing.T) {
test(t, createMachineSetTestConfig(testNamespace, 10, map[string]string{
nodeGroupMinSizeAnnotationKey: "1",
nodeGroupMaxSizeAnnotationKey: "10",
}))
})

t.Run("MachineDeployment", func(t *testing.T) {
test(t, createMachineDeploymentTestConfig(testNamespace, 10, map[string]string{
nodeGroupMinSizeAnnotationKey: "1",
nodeGroupMaxSizeAnnotationKey: "10",
}))
})
}
19 changes: 19 additions & 0 deletions cluster-autoscaler/cloudprovider/clusterapi/machine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,25 @@ type MachineStatus struct {
// NodeRef will point to the corresponding Node if it exists.
// +optional
NodeRef *corev1.ObjectReference `json:"nodeRef,omitempty"`

// FailureMessage will be set in the event that there is a terminal problem
// reconciling the Machine and will contain a more verbose string suitable
// for logging and human consumption.
//
// This field should not be set for transitive errors that a controller
// faces that are expected to be fixed automatically over
// time (like service outages), but instead indicate that something is
// fundamentally wrong with the Machine's spec or the configuration of
// the controller, and that manual intervention is required. Examples
// of terminal errors would be invalid combinations of settings in the
// spec, values that are unsupported by the controller, or the
// responsible controller itself being critically misconfigured.
//
// Any transient errors that occur during the reconciliation of Machines
// can be added as events to the Machine object and/or logged in the
// controller's output.
// +optional
FailureMessage *string `json:"failureMessage,omitempty"`
}

// MachineList contains a list of Machine
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit c6d8539

Please sign in to comment.