diff --git a/cluster-autoscaler/cloudprovider/clusterapi/README.md b/cluster-autoscaler/cloudprovider/clusterapi/README.md index 6bbfa1914612..d68e4b2ddaf8 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/README.md +++ b/cluster-autoscaler/cloudprovider/clusterapi/README.md @@ -169,8 +169,7 @@ cluster-autoscaler --cloud-provider=clusterapi \ To enable the automatic scaling of components in your cluster-api managed cloud there are a few annotations you need to provide. These annotations -must be applied to either [MachineSet](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-set.html) -or [MachineDeployment](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-deployment.html) +must be applied to either [MachineSet](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-set.html), [MachineDeployment](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-deployment.html), or [MachinePool](https://cluster-api.sigs.k8s.io/developer/architecture/controllers/machine-pool.html) resources depending on the type of cluster-api mechanism that you are using. There are two annotations that control how a cluster resource should be scaled: @@ -185,9 +184,13 @@ There are two annotations that control how a cluster resource should be scaled: the maximum number of nodes for the associated resource group. The autoscaler will not scale the group above this number. -The autoscaler will monitor any `MachineSet` or `MachineDeployment` containing +The autoscaler will monitor any `MachineSet`, `MachineDeployment`, or `MachinePool` containing both of these annotations. +> Note: `MachinePool` support in cluster-autoscaler requires a provider implementation +> that supports the new "MachinePool Machines" feature. MachinePools in Cluster API are +> considered an [experimental feature](https://cluster-api.sigs.k8s.io/tasks/experimental-features/experimental-features.html#active-experimental-features) and are not enabled by default. + ### Scale from zero support The Cluster API community has defined an opt-in method for infrastructure @@ -256,7 +259,7 @@ rules: #### Pre-defined labels and taints on nodes scaled from zero To provide labels or taint information for scale from zero, the optional -capacity annotations may be supplied as a comma separated list, as +capacity annotations may be supplied as a comma separated list, as demonstrated in the example below: ```yaml @@ -347,12 +350,12 @@ spec: class: "quick-start" version: v1.24.0 controlPlane: - replicas: 1 + replicas: 1 workers: machineDeployments: - class: default-worker name: linux - ## replicas field is not set. + ## replicas field is not set. ## replicas: 1 ``` diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go index 8fbbc15fd39d..567bba5e0d1c 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller.go @@ -42,9 +42,10 @@ import ( ) const ( - machineProviderIDIndex = "machineProviderIDIndex" - nodeProviderIDIndex = "nodeProviderIDIndex" - defaultCAPIGroup = "cluster.x-k8s.io" + machineProviderIDIndex = "machineProviderIDIndex" + machinePoolProviderIDIndex = "machinePoolProviderIDIndex" + nodeProviderIDIndex = "nodeProviderIDIndex" + defaultCAPIGroup = "cluster.x-k8s.io" // CAPIGroupEnvVar contains the environment variable name which allows overriding defaultCAPIGroup. CAPIGroupEnvVar = "CAPI_GROUP" // CAPIVersionEnvVar contains the environment variable name which allows overriding the Cluster API group version. @@ -52,18 +53,20 @@ const ( resourceNameMachine = "machines" resourceNameMachineSet = "machinesets" resourceNameMachineDeployment = "machinedeployments" + resourceNameMachinePool = "machinepools" failedMachinePrefix = "failed-machine-" pendingMachinePrefix = "pending-machine-" machineTemplateKind = "MachineTemplate" machineDeploymentKind = "MachineDeployment" machineSetKind = "MachineSet" + machinePoolKind = "MachinePool" machineKind = "Machine" autoDiscovererTypeClusterAPI = "clusterapi" autoDiscovererClusterNameKey = "clusterName" autoDiscovererNamespaceKey = "namespace" ) -// machineController watches for Nodes, Machines, MachineSets and +// machineController watches for Nodes, Machines, MachinePools, MachineSets, and // MachineDeployments as they are added, updated and deleted on the // cluster. Additionally, it adds indices to the node informers to // satisfy lookup by node.Spec.ProviderID. @@ -73,11 +76,14 @@ type machineController struct { machineDeploymentInformer informers.GenericInformer machineInformer informers.GenericInformer machineSetInformer informers.GenericInformer + machinePoolInformer informers.GenericInformer nodeInformer cache.SharedIndexInformer managementClient dynamic.Interface managementScaleClient scale.ScalesGetter machineSetResource schema.GroupVersionResource machineResource schema.GroupVersionResource + machinePoolResource schema.GroupVersionResource + machinePoolsAvailable bool machineDeploymentResource schema.GroupVersionResource machineDeploymentsAvailable bool accessLock sync.Mutex @@ -88,6 +94,29 @@ type machineController struct { stopChannel <-chan struct{} } +func indexMachinePoolByProviderID(obj interface{}) ([]string, error) { + u, ok := obj.(*unstructured.Unstructured) + if !ok { + return nil, nil + } + + providerIDList, found, err := unstructured.NestedStringSlice(u.UnstructuredContent(), "spec", "providerIDList") + if err != nil || !found { + return nil, nil + } + if len(providerIDList) == 0 { + return nil, nil + } + + normalizedProviderIDs := make([]string, len(providerIDList)) + + for i, s := range providerIDList { + normalizedProviderIDs[i] = string(normalizedProviderString(s)) + } + + return normalizedProviderIDs, nil +} + func indexMachineByProviderID(obj interface{}) ([]string, error) { u, ok := obj.(*unstructured.Unstructured) if !ok { @@ -188,6 +217,9 @@ func (c *machineController) run() error { if c.machineDeploymentsAvailable { syncFuncs = append(syncFuncs, c.machineDeploymentInformer.Informer().HasSynced) } + if c.machinePoolsAvailable { + syncFuncs = append(syncFuncs, c.machinePoolInformer.Informer().HasSynced) + } klog.V(4).Infof("waiting for caches to sync") if !cache.WaitForCacheSync(c.stopChannel, syncFuncs...) { @@ -198,31 +230,39 @@ func (c *machineController) run() error { } func (c *machineController) findScalableResourceByProviderID(providerID normalizedProviderID) (*unstructured.Unstructured, error) { + // Check for a MachinePool first to simplify the logic afterward. + if c.machinePoolsAvailable { + machinePool, err := c.findMachinePoolByProviderID(providerID) + if err != nil { + return nil, err + } + if machinePool != nil { + return machinePool, nil + } + } + + // Check for a MachineSet. machine, err := c.findMachineByProviderID(providerID) if err != nil { return nil, err } - if machine == nil { return nil, nil } - machineSet, err := c.findMachineOwner(machine) if err != nil { return nil, err } - if machineSet == nil { return nil, nil } + // Check for a MachineDeployment. if c.machineDeploymentsAvailable { machineDeployment, err := c.findMachineSetOwner(machineSet) if err != nil { return nil, err } - - // If a matching machineDeployment was found return it if machineDeployment != nil { return machineDeployment, nil } @@ -231,6 +271,28 @@ func (c *machineController) findScalableResourceByProviderID(providerID normaliz return machineSet, nil } +// findMachinePoolByProviderID finds machine pools matching providerID. A +// DeepCopy() of the object is returned on success. +func (c *machineController) findMachinePoolByProviderID(providerID normalizedProviderID) (*unstructured.Unstructured, error) { + objs, err := c.machinePoolInformer.Informer().GetIndexer().ByIndex(machinePoolProviderIDIndex, string(providerID)) + if err != nil { + return nil, err + } + + switch n := len(objs); { + case n > 1: + return nil, fmt.Errorf("internal error; expected len==1, got %v", n) + case n == 1: + u, ok := objs[0].(*unstructured.Unstructured) + if !ok { + return nil, fmt.Errorf("internal error; unexpected type %T", objs[0]) + } + return u.DeepCopy(), nil + default: + return nil, nil + } +} + // findMachineByProviderID finds machine matching providerID. A // DeepCopy() of the object is returned on success. func (c *machineController) findMachineByProviderID(providerID normalizedProviderID) (*unstructured.Unstructured, error) { @@ -337,7 +399,7 @@ func getCAPIVersion() string { } // newMachineController constructs a controller that watches Nodes, -// Machines and MachineSet as they are added, updated and deleted on +// Machines, MachinePools, MachineDeployments, and MachineSets as they are added, updated, and deleted on // the cluster. func newMachineController( managementClient dynamic.Interface, @@ -395,6 +457,32 @@ func newMachineController( return nil, fmt.Errorf("failed to add event handler for resource %q: %v", resourceNameMachineSet, err) } + var gvrMachinePool schema.GroupVersionResource + var machinePoolInformer informers.GenericInformer + + machinePoolsAvailable, err := groupVersionHasResource(managementDiscoveryClient, + fmt.Sprintf("%s/%s", CAPIGroup, CAPIVersion), resourceNameMachinePool) + if err != nil { + return nil, fmt.Errorf("failed to validate if resource %q is available for group %q: %v", + resourceNameMachinePool, fmt.Sprintf("%s/%s", CAPIGroup, CAPIVersion), err) + } + + if machinePoolsAvailable { + gvrMachinePool = schema.GroupVersionResource{ + Group: CAPIGroup, + Version: CAPIVersion, + Resource: resourceNameMachinePool, + } + machinePoolInformer = managementInformerFactory.ForResource(gvrMachinePool) + machinePoolInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{}) + + if err := machinePoolInformer.Informer().GetIndexer().AddIndexers(cache.Indexers{ + machinePoolProviderIDIndex: indexMachinePoolByProviderID, + }); err != nil { + return nil, fmt.Errorf("cannot add machine pool indexer: %v", err) + } + } + gvrMachine := schema.GroupVersionResource{ Group: CAPIGroup, Version: CAPIVersion, @@ -429,10 +517,13 @@ func newMachineController( machineDeploymentInformer: machineDeploymentInformer, machineInformer: machineInformer, machineSetInformer: machineSetInformer, + machinePoolInformer: machinePoolInformer, nodeInformer: nodeInformer, managementClient: managementClient, managementScaleClient: managementScaleClient, machineSetResource: gvrMachineSet, + machinePoolResource: gvrMachinePool, + machinePoolsAvailable: machinePoolsAvailable, machineResource: gvrMachine, machineDeploymentResource: gvrMachineDeployment, machineDeploymentsAvailable: machineDeploymentAvailable, @@ -475,12 +566,37 @@ func getAPIGroupPreferredVersion(client discovery.DiscoveryInterface, APIGroup s } func (c *machineController) scalableResourceProviderIDs(scalableResource *unstructured.Unstructured) ([]string, error) { + if scalableResource.GetKind() == machinePoolKind { + return c.findMachinePoolProviderIDs(scalableResource) + } + return c.findScalableResourceProviderIDs(scalableResource) +} + +func (c *machineController) findMachinePoolProviderIDs(scalableResource *unstructured.Unstructured) ([]string, error) { + var providerIDs []string + + providerIDList, found, err := unstructured.NestedStringSlice(scalableResource.UnstructuredContent(), "spec", "providerIDList") + if err != nil { + return nil, err + } + if found { + providerIDs = providerIDList + } else { + klog.Warningf("Machine Pool %q has no providerIDList", scalableResource.GetName()) + } + + klog.V(4).Infof("nodegroup %s has %d nodes: %v", scalableResource.GetName(), len(providerIDs), providerIDs) + return providerIDs, nil +} + +func (c *machineController) findScalableResourceProviderIDs(scalableResource *unstructured.Unstructured) ([]string, error) { + var providerIDs []string + machines, err := c.listMachinesForScalableResource(scalableResource) if err != nil { return nil, fmt.Errorf("error listing machines: %v", err) } - var providerIDs []string for _, machine := range machines { providerID, found, err := unstructured.NestedString(machine.UnstructuredContent(), "spec", "providerID") if err != nil { @@ -550,8 +666,7 @@ func (c *machineController) scalableResourceProviderIDs(scalableResource *unstru } } - klog.V(4).Infof("nodegroup %s has nodes %v", scalableResource.GetName(), providerIDs) - + klog.V(4).Infof("nodegroup %s has %d nodes: %v", scalableResource.GetName(), len(providerIDs), providerIDs) return providerIDs, nil } @@ -666,6 +781,16 @@ func (c *machineController) listScalableResources() ([]*unstructured.Unstructure scalableResources = append(scalableResources, machineDeployments...) } + + if c.machinePoolsAvailable { + machinePools, err := c.listResources(c.machinePoolInformer.Lister()) + if err != nil { + return nil, err + } + + scalableResources = append(scalableResources, machinePools...) + } + return scalableResources, nil } diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go index ad9d6ad63eca..359111be6bb1 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_controller_test.go @@ -56,6 +56,7 @@ type testConfig struct { machineDeployment *unstructured.Unstructured machineSet *unstructured.Unstructured machineTemplate *unstructured.Unstructured + machinePool *unstructured.Unstructured machines []*unstructured.Unstructured nodes []*corev1.Node } @@ -65,6 +66,7 @@ type testSpec struct { capacity map[string]string machineDeploymentName string machineSetName string + machinePoolName string clusterName string namespace string nodeCount int @@ -106,9 +108,11 @@ func mustCreateTestController(t *testing.T, testConfigs ...*testConfig) (*machin {Group: "cluster.x-k8s.io", Version: "v1alpha3", Resource: "machinedeployments"}: "kindList", {Group: "cluster.x-k8s.io", Version: "v1alpha3", Resource: "machines"}: "kindList", {Group: "cluster.x-k8s.io", Version: "v1alpha3", Resource: "machinesets"}: "kindList", + {Group: "cluster.x-k8s.io", Version: "v1alpha3", Resource: "machinepools"}: "kindList", {Group: "cluster.x-k8s.io", Version: "v1beta1", Resource: "machinedeployments"}: "kindList", {Group: "cluster.x-k8s.io", Version: "v1beta1", Resource: "machines"}: "kindList", {Group: "cluster.x-k8s.io", Version: "v1beta1", Resource: "machinesets"}: "kindList", + {Group: "custom.x-k8s.io", Version: "v1beta1", Resource: "machinepools"}: "kindList", {Group: "custom.x-k8s.io", Version: "v1beta1", Resource: "machinedeployments"}: "kindList", {Group: "custom.x-k8s.io", Version: "v1beta1", Resource: "machines"}: "kindList", {Group: "custom.x-k8s.io", Version: "v1beta1", Resource: "machinesets"}: "kindList", @@ -139,6 +143,9 @@ func mustCreateTestController(t *testing.T, testConfigs ...*testConfig) (*machin { Name: resourceNameMachine, }, + { + Name: resourceNameMachinePool, + }, }, }, { @@ -153,6 +160,9 @@ func mustCreateTestController(t *testing.T, testConfigs ...*testConfig) (*machin { Name: resourceNameMachine, }, + { + Name: resourceNameMachinePool, + }, }, }, }, @@ -162,8 +172,8 @@ func mustCreateTestController(t *testing.T, testConfigs ...*testConfig) (*machin scaleClient := &fakescale.FakeScaleClient{Fake: clientgotesting.Fake{}} scaleReactor := func(action clientgotesting.Action) (bool, runtime.Object, error) { resource := action.GetResource().Resource - if resource != resourceNameMachineSet && resource != resourceNameMachineDeployment { - // Do not attempt to react to resources that are not MachineSet or MachineDeployment + if resource != resourceNameMachineSet && resource != resourceNameMachineDeployment && resource != resourceNameMachinePool { + // Do not attempt to react to resources that are not MachineSet, MachineDeployment, or MachinePool return false, nil, nil } @@ -495,6 +505,12 @@ func addTestConfigs(t *testing.T, controller *machineController, testConfigs ... return err } + if config.machinePool != nil { + if err := createResource(controller.managementClient, controller.machinePoolInformer, controller.machinePoolResource, config.machinePool); err != nil { + return err + } + } + for i := range config.machines { if err := createResource(controller.managementClient, controller.machineInformer, controller.machineResource, config.machines[i]); err != nil { return err diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go index ef3130ede398..d5708271ac27 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go @@ -102,7 +102,7 @@ func (ng *nodegroup) DeleteNodes(nodes []*corev1.Node) error { } // if we are at minSize already we wail early. - if int(replicas) <= ng.MinSize() { + if replicas <= ng.MinSize() { return fmt.Errorf("min size reached, nodes will not be deleted") } diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go index 81fccf43e28d..4eec0e4bf7ec 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go @@ -63,6 +63,8 @@ func (r unstructuredScalableResource) GroupVersionResource() (schema.GroupVersio return r.controller.machineDeploymentResource, nil case machineSetKind: return r.controller.machineSetResource, nil + case machinePoolKind: + return r.controller.machinePoolResource, nil default: return schema.GroupVersionResource{}, fmt.Errorf("unknown scalable resource kind %s", r.Kind()) } diff --git a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go index d10c8ce21ed5..7d007363d090 100644 --- a/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go +++ b/cluster-autoscaler/cloudprovider/clusterapi/clusterapi_utils.go @@ -142,9 +142,11 @@ func parseScalingBounds(annotations map[string]string) (int, int, error) { } func getOwnerForKind(u *unstructured.Unstructured, kind string) *metav1.OwnerReference { - for _, ref := range u.GetOwnerReferences() { - if ref.Kind == kind && ref.Name != "" { - return ref.DeepCopy() + if u != nil { + for _, ref := range u.GetOwnerReferences() { + if ref.Kind == kind && ref.Name != "" { + return ref.DeepCopy() + } } }