Skip to content

Commit

Permalink
Add cloud provider for Rancher with RKE2
Browse files Browse the repository at this point in the history
Signed-off-by: Cyrill Troxler <[email protected]>
  • Loading branch information
ctrox committed Jun 23, 2022
1 parent 9a76079 commit 560b331
Show file tree
Hide file tree
Showing 15 changed files with 1,962 additions and 2 deletions.
2 changes: 2 additions & 0 deletions cluster-autoscaler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ You should also take a look at the notes and "gotchas" for your specific cloud p
* [BizflyCloud](./cloudprovider/bizflycloud/README.md)
* [Vultr](./cloudprovider/vultr/README.md)
* [TencentCloud](./cloudprovider/tencentcloud/README.md)
* [Rancher](./cloudprovider/rancher/README.md)

# Releases

Expand Down Expand Up @@ -177,3 +178,4 @@ Supported cloud providers:
* Cluster API https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/clusterapi/README.md
* Vultr https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/vultr/README.md
* TencentCloud https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/tencentcloud/README.md
* Rancher https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/rancher/README.md
8 changes: 6 additions & 2 deletions cluster-autoscaler/cloudprovider/builder/builder_all.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//go:build !gce && !aws && !azure && !kubemark && !alicloud && !magnum && !digitalocean && !clusterapi && !huaweicloud && !ionoscloud && !linode && !hetzner && !bizflycloud && !brightbox && !packet && !oci && !vultr && !tencentcloud && !externalgrpc
// +build !gce,!aws,!azure,!kubemark,!alicloud,!magnum,!digitalocean,!clusterapi,!huaweicloud,!ionoscloud,!linode,!hetzner,!bizflycloud,!brightbox,!packet,!oci,!vultr,!tencentcloud,!externalgrpc
//go:build !gce && !aws && !azure && !kubemark && !alicloud && !magnum && !digitalocean && !clusterapi && !huaweicloud && !ionoscloud && !linode && !hetzner && !bizflycloud && !brightbox && !packet && !oci && !vultr && !tencentcloud && !externalgrpc && !rancher
// +build !gce,!aws,!azure,!kubemark,!alicloud,!magnum,!digitalocean,!clusterapi,!huaweicloud,!ionoscloud,!linode,!hetzner,!bizflycloud,!brightbox,!packet,!oci,!vultr,!tencentcloud,!externalgrpc,!rancher

/*
Copyright 2018 The Kubernetes Authors.
Expand Down Expand Up @@ -42,6 +42,7 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/ovhcloud"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/packet"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/rancher"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/tencentcloud"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/vultr"
"k8s.io/autoscaler/cluster-autoscaler/config"
Expand Down Expand Up @@ -72,6 +73,7 @@ var AvailableCloudProviders = []string{
cloudprovider.PacketProviderName,
cloudprovider.VultrProviderName,
cloudprovider.TencentcloudProviderName,
cloudprovider.RancherProviderName,
}

// DefaultCloudProvider is GCE.
Expand Down Expand Up @@ -125,6 +127,8 @@ func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGro
return vultr.BuildVultr(opts, do, rl)
case cloudprovider.TencentcloudProviderName:
return tencentcloud.BuildTencentcloud(opts, do, rl)
case cloudprovider.RancherProviderName:
return rancher.BuildRancher(opts, do, rl)
}
return nil
}
43 changes: 43 additions & 0 deletions cluster-autoscaler/cloudprovider/builder/builder_rancher.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
//go:build rancher
// +build rancher

/*
Copyright 2018 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package builder

import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider/rancher"
"k8s.io/autoscaler/cluster-autoscaler/config"
)

// AvailableCloudProviders lists the cloud providers supported by the cloud
// provider builder. This file is only compiled with the `rancher` build tag,
// so rancher is the single entry.
var AvailableCloudProviders = []string{
cloudprovider.RancherProviderName,
}

// DefaultCloudProvider is the default cloud provider name for a rancher-only
// build, which is rancher.
const DefaultCloudProvider = cloudprovider.RancherProviderName

// buildCloudProvider returns the Rancher cloud provider when
// opts.CloudProviderName selects it. Any other provider name yields nil,
// because a rancher-only build knows no other providers.
func buildCloudProvider(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscoveryOptions, rl *cloudprovider.ResourceLimiter) cloudprovider.CloudProvider {
	if opts.CloudProviderName == cloudprovider.RancherProviderName {
		return rancher.BuildRancher(opts, do, rl)
	}

	return nil
}
2 changes: 2 additions & 0 deletions cluster-autoscaler/cloudprovider/cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ const (
TencentcloudProviderName = "tencentcloud"
// ExternalGrpcProviderName gets the provider name of the external grpc provider
ExternalGrpcProviderName = "externalgrpc"
// RancherProviderName gets the provider name of rancher
RancherProviderName = "rancher"
)

// CloudProvider contains configuration info and functions for interacting with
Expand Down
10 changes: 10 additions & 0 deletions cluster-autoscaler/cloudprovider/rancher/OWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
approvers:
#- ctrox
#- gajicdev
#- pawelkuc
#- thirdeyenick
reviewers:
#- ctrox
#- gajicdev
#- pawelkuc
#- thirdeyenick
74 changes: 74 additions & 0 deletions cluster-autoscaler/cloudprovider/rancher/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Cluster Autoscaler for Rancher with RKE2

This cluster autoscaler for Rancher scales nodes in clusters which use RKE2
provisioning (Rancher v2.6+). It uses a combination of the Rancher API and the
underlying cluster-api types of RKE2.

## Configuration

The `cluster-autoscaler` for Rancher needs a configuration file, which is
passed using the `--cloud-config` parameter. An up-to-date example can be
found in [examples/config.yaml](./examples/config.yaml).

### Permissions

The Rancher server account provided in the `cloud-config` requires the
following permissions on the Rancher server:

* Get/Update of the `clusters.provisioning.cattle.io` resource to autoscale
* List of `machines.cluster.x-k8s.io` in the namespace of the cluster resource

## Running the Autoscaler

The `cluster-autoscaler` can be run inside the RKE2 cluster, on the Rancher
server cluster or on a completely separate machine. To run it outside the RKE2
cluster, make sure to provide a kubeconfig with `--kubeconfig`.

To start the autoscaler with the Rancher provider, the cloud provider needs to
be specified:

```bash
cluster-autoscaler --cloud-provider=rancher --cloud-config=config.yaml
```

## Enabling Autoscaling

In order for the autoscaler to function, the RKE2 cluster needs to be
configured accordingly. The autoscaler works by adjusting the `quantity` of a
`machinePool` dynamically. For the autoscaler to know the min/max size of a
`machinePool` we need to set a few annotations using the
`machineDeploymentAnnotations` field. That field has been chosen because
updating it does not trigger a full rollout of a `machinePool`.

```yaml
apiVersion: provisioning.cattle.io/v1
kind: Cluster
spec:
rkeConfig:
machinePools:
- name: pool-1
quantity: 1
workerRole: true
machineDeploymentAnnotations:
cluster.provisioning.cattle.io/autoscaler-min-size: "1"
cluster.provisioning.cattle.io/autoscaler-max-size: "3"
```
Optionally, to enable scaling a `machinePool` to and from 0 nodes, we need to
add a few more annotations that tell the autoscaler which resources a single
node in the pool provides:

```yaml
apiVersion: provisioning.cattle.io/v1
kind: Cluster
spec:
rkeConfig:
machinePools:
- name: pool-1
machineDeploymentAnnotations:
cluster.provisioning.cattle.io/autoscaler-min-size: "0"
cluster.provisioning.cattle.io/autoscaler-max-size: "3"
cluster.provisioning.cattle.io/autoscaler-resource-cpu: "1"
cluster.provisioning.cattle.io/autoscaler-resource-ephemeral-storage: 50Gi
cluster.provisioning.cattle.io/autoscaler-resource-memory: 4Gi
```
9 changes: 9 additions & 0 deletions cluster-autoscaler/cloudprovider/rancher/examples/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# rancher server credentials
url: https://rancher.example.org
token: <rancher token>
# name and namespace of the clusters.provisioning.cattle.io resource on the
# rancher server
clusterName: my-cluster
clusterNamespace: fleet-default
# optional, will be auto-discovered if not specified
#clusterAPIVersion: v1alpha4
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package v1

import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

// RKEMachinePool configures an RKE machine pool, i.e. one entry of the
// `machinePools` list of a cluster's `rkeConfig`.
//
// NOTE(review): this type appears to mirror the machine pool type of Rancher's
// provisioning.cattle.io/v1 API — confirm against upstream before changing
// field names or JSON tags.
type RKEMachinePool struct {
// Embedded common node settings (labels, taints, cloud credential secret
// name). NOTE(review): presumably flattened into the pool's own JSON object
// — confirm with the serializer in use.
RKECommonNodeConfig `json:",inline"`

Paused bool `json:"paused,omitempty"`
EtcdRole bool `json:"etcdRole,omitempty"`
ControlPlaneRole bool `json:"controlPlaneRole,omitempty"`
WorkerRole bool `json:"workerRole,omitempty"`
DrainBeforeDelete bool `json:"drainBeforeDelete,omitempty"`
DrainBeforeDeleteTimeout *metav1.Duration `json:"drainBeforeDeleteTimeout,omitempty"`
NodeConfig *corev1.ObjectReference `json:"machineConfigRef,omitempty" wrangler:"required"`
// Name is the `name` of the machine pool (e.g. "pool-1" in the README).
Name string `json:"name,omitempty" wrangler:"required"`
DisplayName string `json:"displayName,omitempty"`
// Quantity is the `quantity` of the machine pool. Per the README, the
// autoscaler works by adjusting this value dynamically.
Quantity *int32 `json:"quantity,omitempty"`
RollingUpdate *RKEMachinePoolRollingUpdate `json:"rollingUpdate,omitempty"`
MachineDeploymentLabels map[string]string `json:"machineDeploymentLabels,omitempty"`
// MachineDeploymentAnnotations carries the autoscaler settings of the pool
// (min/max size and, for scaling from/to 0, per-node resources); see the
// README for the annotation keys.
MachineDeploymentAnnotations map[string]string `json:"machineDeploymentAnnotations,omitempty"`
NodeStartupTimeout *metav1.Duration `json:"nodeStartupTimeout,omitempty"`
UnhealthyNodeTimeout *metav1.Duration `json:"unhealthyNodeTimeout,omitempty"`
MaxUnhealthy *string `json:"maxUnhealthy,omitempty"`
UnhealthyRange *string `json:"unhealthyRange,omitempty"`
MachineOS string `json:"machineOS,omitempty"`
}

// RKEMachinePoolRollingUpdate configures the rolling update of a machine pool.
// Both fields are optional pointers tagged `omitempty`; each accepts either an
// absolute number or a percentage.
type RKEMachinePoolRollingUpdate struct {
// The maximum number of machines that can be unavailable during the update.
// Value can be an absolute number (ex: 5) or a percentage of desired
// machines (ex: 10%).
// Absolute number is calculated from percentage by rounding down.
// This cannot be 0 if MaxSurge is 0.
// Defaults to 0.
// Example: when this is set to 30%, the old MachineSet can be scaled
// down to 70% of desired machines immediately when the rolling update
// starts. Once new machines are ready, old MachineSet can be scaled
// down further, followed by scaling up the new MachineSet, ensuring
// that the total number of machines available at all times
// during the update is at least 70% of desired machines.
// +optional
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`

// The maximum number of machines that can be scheduled above the
// desired number of machines.
// Value can be an absolute number (ex: 5) or a percentage of
// desired machines (ex: 10%).
// This cannot be 0 if MaxUnavailable is 0.
// Absolute number is calculated from percentage by rounding up.
// Defaults to 1.
// Example: when this is set to 30%, the new MachineSet can be scaled
// up immediately when the rolling update starts, such that the total
// number of old and new machines does not exceed 130% of desired
// machines. Once old machines have been killed, new MachineSet can
// be scaled up further, ensuring that total number of machines running
// at any time during the update is at most 130% of desired machines.
// +optional
MaxSurge *intstr.IntOrString `json:"maxSurge,omitempty"`
}

// RKECommonNodeConfig contains common node configuration. It is embedded in
// RKEMachinePool.
type RKECommonNodeConfig struct {
// Labels is a set of key/value labels.
// NOTE(review): presumably applied to the nodes of the pool — confirm.
Labels map[string]string `json:"labels,omitempty"`
// Taints is a list of Kubernetes taints.
// NOTE(review): presumably applied to the nodes of the pool — confirm.
Taints []corev1.Taint `json:"taints,omitempty"`
// CloudCredentialSecretName is the name of a cloud credential secret.
// NOTE(review): how the secret is resolved is not visible here.
CloudCredentialSecretName string `json:"cloudCredentialSecretName,omitempty"`
}
57 changes: 57 additions & 0 deletions cluster-autoscaler/cloudprovider/rancher/rancher_clusterapi.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package rancher

import (
"fmt"

"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/discovery"
)

const (
// clusterAPIGroup is the API group of the cluster-api resources. It is the
// Group returned by machineGVR and the prefix of the keys below.
clusterAPIGroup = "cluster.x-k8s.io"
// machineDeleteAnnotationKey is the annotation key
// "cluster.x-k8s.io/delete-machine".
// NOTE(review): presumably marks a machine for removal on scale down —
// confirm against its users.
machineDeleteAnnotationKey = clusterAPIGroup + "/delete-machine"
// Machine phase names.
// NOTE(review): presumably compared against a machine's status phase; the
// usage is not visible in this file.
machinePhaseProvisioning = "Provisioning"
machinePhasePending = "Pending"
machinePhaseDeleting = "Deleting"
// machineDeploymentNameLabelKey is the label key
// "cluster.x-k8s.io/deployment-name".
machineDeploymentNameLabelKey = clusterAPIGroup + "/deployment-name"
// machineResourceName is the resource name used by machineGVR.
machineResourceName = "machines"
)

// getAPIGroupPreferredVersion looks up apiGroup among the API groups reported
// by the server and returns the version string of its preferred version.
//
// It returns an error when the server groups cannot be listed or when no group
// with the given name exists. The listing error is wrapped with %w, so callers
// can inspect the cause with errors.Is / errors.As.
func getAPIGroupPreferredVersion(client discovery.DiscoveryInterface, apiGroup string) (string, error) {
	groupList, err := client.ServerGroups()
	if err != nil {
		// Wrap instead of flattening to text so the error chain is preserved.
		return "", fmt.Errorf("failed to get ServerGroups: %w", err)
	}

	for _, group := range groupList.Groups {
		if group.Name == apiGroup {
			return group.PreferredVersion.Version, nil
		}
	}

	return "", fmt.Errorf("failed to find API group %q", apiGroup)
}

// machineGVR builds the GroupVersionResource of the cluster-api machines
// resource for the given API version.
func machineGVR(version string) schema.GroupVersionResource {
	var gvr schema.GroupVersionResource
	gvr.Group = clusterAPIGroup
	gvr.Version = version
	gvr.Resource = machineResourceName
	return gvr
}
46 changes: 46 additions & 0 deletions cluster-autoscaler/cloudprovider/rancher/rancher_config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
Copyright 2020 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package rancher

import (
"fmt"
"os"

"gopkg.in/yaml.v2"
)

// cloudConfig is the schema of the YAML file passed via --cloud-config
// (see examples/config.yaml).
type cloudConfig struct {
// URL and Token are the rancher server credentials.
URL string `yaml:"url"`
Token string `yaml:"token"`
// ClusterName and ClusterNamespace are the name and namespace of the
// clusters.provisioning.cattle.io resource on the rancher server.
ClusterName string `yaml:"clusterName"`
ClusterNamespace string `yaml:"clusterNamespace"`
// ClusterAPIVersion is optional; per the example config it is
// auto-discovered if not specified.
ClusterAPIVersion string `yaml:"clusterAPIVersion"`
}

// newConfig loads the cloud config from the YAML file at the given path.
// It returns a wrapped error if the file cannot be read or its content cannot
// be unmarshalled into a cloudConfig.
func newConfig(file string) (*cloudConfig, error) {
	raw, readErr := os.ReadFile(file)
	if readErr != nil {
		return nil, fmt.Errorf("unable to read cloud config file: %w", readErr)
	}

	var cfg cloudConfig
	if parseErr := yaml.Unmarshal(raw, &cfg); parseErr != nil {
		return nil, fmt.Errorf("unable to unmarshal config file: %w", parseErr)
	}

	return &cfg, nil
}
Loading

0 comments on commit 560b331

Please sign in to comment.