sync to kubernetes#5445 of upstream
rishabh-11 committed May 6, 2023
2 parents a0cfdff + 5d55f99 commit 8315245
Showing 895 changed files with 35,801 additions and 30,998 deletions.
2 changes: 2 additions & 0 deletions addon-resizer/OWNERS
@@ -5,3 +5,5 @@ reviewers:
emeritus_approvers:
- bskiba # 2022-09-30
- wojtek-t # 2022-09-30
labels:
- addon-resizer
2 changes: 1 addition & 1 deletion cluster-autoscaler/FAQ.md
@@ -873,7 +873,7 @@ This limitation was solved with
introduced as beta in Kubernetes 1.11 and planned for GA in 1.13.
To allow CA to take advantage of topological scheduling, use separate node groups per zone.
This way CA knows exactly which node group will create nodes in the required zone rather than relying on the cloud provider choosing a zone for a new node in a multi-zone node group.
When using separate node groups per zone, the `--balance-similar-node-groups` flag will keep nodes balanced across zones for workloads that dont require topological scheduling.
When using separate node groups per zone, the `--balance-similar-node-groups` flag will keep nodes balanced across zones for workloads that don't require topological scheduling.
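As an illustration, a per-zone setup with balancing enabled might look like the following container arguments. This is only a sketch: the node group names and size bounds are placeholders, and the exact node-group registration flags depend on your cloud provider.

```yaml
# Sketch: one node group per zone, registered explicitly, with balancing enabled.
# The provider, names, and size bounds are placeholders.
command:
  - ./cluster-autoscaler
  - --cloud-provider=aws               # any provider with per-zone node groups
  - --nodes=1:10:workers-zone-a        # node group whose nodes land in zone a
  - --nodes=1:10:workers-zone-b        # node group whose nodes land in zone b
  - --balance-similar-node-groups=true
```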
### CA doesn’t work, but it used to work yesterday. Why?
17 changes: 10 additions & 7 deletions cluster-autoscaler/cloudprovider/POLICY.md
@@ -74,13 +74,14 @@ in-tree cloudprovider follows the following rules:
* It is required that both reviewers and approvers sections of OWNERS file
are non-empty.
* This can create a chicken and egg problem, where adding a cloudprovider
requires being a member of Kubernetes org and becoming a member of the
organization requires a history of code contributions. For this reason it
is allowed for the OWNERS file to temporarily contain
commented out github handles. There is an expectation that at least some of
the owners will ultimately join Kubernetes organization (by following the
[process](https://github.com/kubernetes/community/blob/master/community-membership.md))
so that they can approve PRs to their cloudprovider.
requires being a member of Kubernetes org and becoming a member of the
organization requires a history of code contributions. For this reason it
is allowed for the OWNERS file to temporarily contain commented out github
handles. There is an expectation that at least some of the owners will
join the Kubernetes organization (by following the
[process](https://github.com/kubernetes/community/blob/master/community-membership.md))
within one release cycle, so that they can approve PRs to their
cloudprovider.
* Cloudprovider shouldn't introduce new dependencies (such as clients/SDKs)
to top-level go.mod vendor, unless those dependencies are already imported
by kubernetes/kubernetes repository and the same version of the library is
@@ -112,6 +113,8 @@ maintenance request_ (CMR) mechanism.
* A CMR may be issued no later than [enhancements
freeze](https://github.com/kubernetes/sig-release/blob/master/releases/release_phases.md#enhancements-freeze)
of a given Kubernetes minor version.
* If a given cloud provider was added more than one release cycle ago and there
are no valid OWNERS, a CMR should request an OWNERS file update.

Cloudprovider owners will be required to address CMR or request an exception via
the CMR github issue. A failure to take any action will result in cloudprovider
@@ -45,7 +45,7 @@ rules:
resources: ["statefulsets", "replicasets", "daemonsets"]
verbs: ["watch","list","get"]
- apiGroups: ["storage.k8s.io"]
resources: ["storageclasses"]
resources: ["storageclasses", "csinodes", "csidrivers", "csistoragecapacities"]
verbs: ["watch","list","get"]

---
@@ -109,7 +109,7 @@ metadata:
type: Opaque
data:
access-key-id: [YOUR_BASE64_AK_ID]
access-key-id: [YOUR_BASE64_AK_SECRET]
access-key-secret: [YOUR_BASE64_AK_SECRET]
region-id: [YOUR_BASE64_REGION_ID]

---
1 change: 1 addition & 0 deletions cluster-autoscaler/cloudprovider/aws/README.md
@@ -521,3 +521,4 @@ Please note: it is also possible to mount the cloud config file from host:
EC2 launch configuration has the setting `Metadata response hop limit` set to `2`.
Otherwise, the `/latest/api/token` call will time out and result in an error. See [AWS docs here](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html#configuring-instance-metadata-options) for further information.
- If you don't use EKS managed nodegroups, don't add the `eks:nodegroup-name` tag to the ASG as this will lead to extra EKS API calls that could slow down scaling when there are 0 nodes in the nodegroup.
- Set the `AWS_MAX_ATTEMPTS` environment variable to configure the maximum number of API retries attempted by the AWS SDK, as sketched below.
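A minimal sketch of how this could be wired into the cluster-autoscaler container spec; the variable is read by the AWS SDK at startup, and the value `6` below is only illustrative:

```yaml
# Sketch: expose AWS_MAX_ATTEMPTS to the autoscaler container so the AWS SDK
# retries API calls up to the configured number of attempts.
env:
  - name: AWS_MAX_ATTEMPTS
    value: "6"   # illustrative value, not a recommendation
```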
@@ -8,7 +8,7 @@ import (
)

// ValidateEndpointHostHandler is a request handler that will validate the
// request endpoint's hosts is a valid RFC 3986 host.
// request endpoint's hosts is a valid RFC 3986 (https://www.ietf.org/rfc/rfc3986.txt) host.
var ValidateEndpointHostHandler = request.NamedHandler{
Name: "awssdk.protocol.ValidateEndpointHostHandler",
Fn: func(r *request.Request) {
@@ -20,7 +20,7 @@ var ValidateEndpointHostHandler = request.NamedHandler{
}

// ValidateEndpointHost validates that the host string passed in is a valid RFC
// 3986 host. Returns error if the host is not valid.
// 3986 (https://www.ietf.org/rfc/rfc3986.txt) host. Returns error if the host is not valid.
func ValidateEndpointHost(opName, host string) error {
paramErrs := request.ErrInvalidParams{Context: opName}

@@ -71,7 +71,7 @@ func ValidateEndpointHost(opName, host string) error {
return nil
}

// ValidHostLabel returns if the label is a valid RFC 3986 host label.
// ValidHostLabel returns if the label is a valid RFC 3986 (https://www.ietf.org/rfc/rfc3986.txt) host label.
func ValidHostLabel(label string) bool {
if l := len(label); l == 0 || l > 63 {
return false
@@ -90,7 +90,7 @@ func ValidHostLabel(label string) bool {
return true
}

// ValidPortNumber return if the port is valid RFC 3986 port
// ValidPortNumber return if the port is valid RFC 3986 (https://www.ietf.org/rfc/rfc3986.txt) port
func ValidPortNumber(port string) bool {
i, err := strconv.Atoi(port)
if err != nil {
28 changes: 26 additions & 2 deletions cluster-autoscaler/cloudprovider/aws/aws_sdk_provider.go
@@ -27,6 +27,7 @@ import (
"k8s.io/klog/v2"
provider_aws "k8s.io/legacy-cloud-providers/aws"
"os"
"strconv"
"strings"
)

@@ -49,8 +50,16 @@ func createAWSSDKProvider(configReader io.Reader) (*awsSDKProvider, error) {
return nil, err
}

sess, err := session.NewSession(aws.NewConfig().WithRegion(getRegion()).
WithEndpointResolver(getResolver(cfg)))
config := aws.NewConfig().
WithRegion(getRegion()).
WithEndpointResolver(getResolver(cfg))

config, err = setMaxRetriesFromEnv(config)
if err != nil {
return nil, err
}

sess, err := session.NewSession(config)

if err != nil {
return nil, err
@@ -63,6 +72,21 @@ func createAWSSDKProvider(configReader io.Reader) (*awsSDKProvider, error) {
return provider, nil
}

// setMaxRetriesFromEnv sets the aws config MaxRetries by reading the AWS_MAX_ATTEMPTS environment variable.
// The AWS SDK does not pick this up automatically, so rather than adding another config option we reuse what the
// AWS CLI does and read AWS_MAX_ATTEMPTS from the env: https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-envvars.html
func setMaxRetriesFromEnv(config *aws.Config) (*aws.Config, error) {
maxRetries := os.Getenv("AWS_MAX_ATTEMPTS")
if maxRetries != "" {
num, err := strconv.Atoi(maxRetries)
if err != nil {
return nil, err
}
config = config.WithMaxRetries(num)
}
return config, nil
}

type awsSDKProvider struct {
session *session.Session
}
91 changes: 91 additions & 0 deletions cluster-autoscaler/cloudprovider/clusterapi/README.md
@@ -5,6 +5,30 @@ the [cluster-api project](https://github.com/kubernetes-sigs/cluster-api) to
manage the provisioning and de-provisioning of nodes within a Kubernetes
cluster.

## Table of Contents:
<!-- TOC BEGIN -->
* [Kubernetes Version](#kubernetes-version)
* [Starting the Autoscaler](#starting-the-autoscaler)
* [Configuring node group auto discovery](#configuring-node-group-auto-discovery)
* [Connecting cluster-autoscaler to Cluster API management and workload Clusters](#connecting-cluster-autoscaler-to-cluster-api-management-and-workload-clusters)
* [Autoscaler running in a joined cluster using service account credentials](#autoscaler-running-in-a-joined-cluster-using-service-account-credentials)
* [Autoscaler running in workload cluster using service account credentials, with separate management cluster](#autoscaler-running-in-workload-cluster-using-service-account-credentials-with-separate-management-cluster)
* [Autoscaler running in management cluster using service account credentials, with separate workload cluster](#autoscaler-running-in-management-cluster-using-service-account-credentials-with-separate-workload-cluster)
* [Autoscaler running anywhere, with separate kubeconfigs for management and workload clusters](#autoscaler-running-anywhere-with-separate-kubeconfigs-for-management-and-workload-clusters)
* [Autoscaler running anywhere, with a common kubeconfig for management and workload clusters](#autoscaler-running-anywhere-with-a-common-kubeconfig-for-management-and-workload-clusters)
* [Enabling Autoscaling](#enabling-autoscaling)
* [Scale from zero support](#scale-from-zero-support)
* [RBAC changes for scaling from zero](#rbac-changes-for-scaling-from-zero)
* [Pre-defined labels and taints on nodes scaled from zero](#pre-defined-labels-and-taints-on-nodes-scaled-from-zero)
* [Specifying a Custom Resource Group](#specifying-a-custom-resource-group)
* [Specifying a Custom Resource Version](#specifying-a-custom-resource-version)
* [Sample manifest](#sample-manifest)
* [A note on permissions](#a-note-on-permissions)
* [Autoscaling with ClusterClass and Managed Topologies](#autoscaling-with-clusterclass-and-managed-topologies)
* [Special note on GPU instances](#special-note-on-gpu-instances)
* [Special note on balancing similar node groups](#special-note-on-balancing-similar-node-groups)
<!-- TOC END -->

## Kubernetes Version

The cluster-api provider requires Kubernetes v1.16 or greater to run the
@@ -336,3 +360,70 @@ CAPI cloudprovider, the label format is as follows:
`cluster-api/accelerator=<gpu-type>`

`<gpu-type>` is arbitrary.

It is important to note that if you are using the `--gpu-total` flag to limit the number
of GPU resources in your cluster, the `<gpu-type>` value must match
between the command line flag and the node labels. Setting these values incorrectly
can lead to the autoscaler creating too many GPU resources.

For example, if you are using the autoscaler command line flag
`--gpu-total=gfx-hardware:1:2` to limit the number of `gfx-hardware` resources
to a minimum of 1 and maximum of 2, then you should use the kubelet node label flag
`--node-labels=cluster-api/accelerator=gfx-hardware`.
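Put together, a matching pair of settings might look like the following sketch, reusing the arbitrary `gfx-hardware` type from the example above:

```yaml
# Cluster Autoscaler container args (sketch): limit gfx-hardware GPUs to between 1 and 2.
- --gpu-total=gfx-hardware:1:2
# Corresponding kubelet flag on the GPU-backed machines (shown here as a comment);
# the <gpu-type> must match the value used in --gpu-total:
#   --node-labels=cluster-api/accelerator=gfx-hardware
```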

## Special note on balancing similar node groups

Balancing similar node groups (activated with the `--balance-similar-node-groups`
flag) is a powerful and popular Cluster Autoscaler feature. When enabled, the
Cluster Autoscaler will attempt to create new nodes in a manner that balances
creation across similar node groups. With Cluster API, these node groups
correspond directly to the scalable resources (usually MachineDeployments and
MachineSets) associated with the nodes in question. In order for the nodes of
these scalable resources
to be considered similar by the Cluster Autoscaler, they must have the same
capacity, labels, and taints for the nodes which will be created from them.

To help the Cluster Autoscaler determine which node groups are
similar, the command line flags `--balancing-ignore-label` and
`--balancing-label` are provided. For an expanded discussion about balancing
similar node groups and the options which are available, please see the
[Cluster Autoscaler FAQ](../../FAQ.md).

Because Cluster API can address many different cloud providers, it is important
to configure the balancing labels to ignore provider-specific labels which
are used for carrying zonal information on Kubernetes nodes. The Cluster
Autoscaler implementation for Cluster API does not assume any labels (aside from
the [well-known Kubernetes labels](https://kubernetes.io/docs/reference/labels-annotations-taints/))
to be ignored when running. Users must configure their Cluster Autoscaler deployment
to ignore labels which might be different between nodes, but which do not
otherwise affect node behavior or size (for example when two MachineDeployments
are the same except for their deployment zones). The Cluster API community has
decided not to carry cloud provider specific labels in the Cluster Autoscaler
to reduce the possibility of label clashes between providers. Additionally,
the community has agreed to promote documentation and the use of the `--balancing-ignore-label`
flag as the preferred deployment method, reducing the ongoing maintenance burden
on the Cluster Autoscaler when new providers are added or updated.
For further context around this decision, please see the
[Cluster API Deep Dive into Cluster Autoscaler Node Group Balancing discussion from 2022-09-12](https://www.youtube.com/watch?v=jbhca_9oPuQ&t=5s).
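As an illustration, a Cluster API deployment spanning two AWS availability zones might ignore the EBS CSI topology label listed in the table below; the arguments here are only a sketch:

```yaml
# Sketch: ignore a provider-specific zonal label so that otherwise-identical
# MachineDeployments in different zones are treated as similar node groups.
command:
  - ./cluster-autoscaler
  - --cloud-provider=clusterapi
  - --balance-similar-node-groups=true
  - --balancing-ignore-label=topology.ebs.csi.aws.com/zone
```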

The following table shows some of the most common labels used by cloud providers
to designate regional or zonal information on Kubernetes nodes. It is shared
here as a reference for users who might be deploying on these infrastructures.

| Cloud Provider | Label to ignore | Notes |
| --- | --- | --- |
| Alibaba Cloud | `topology.diskplugin.csi.alibabacloud.com/zone` | Used by the Alibaba Cloud CSI driver as a target for persistent volume node affinity |
| AWS | `alpha.eksctl.io/instance-id` | Used by `eksctl` to identify instances |
| AWS | `alpha.eksctl.io/nodegroup-name` | Used by `eksctl` to identify node group names |
| AWS | `eks.amazonaws.com/nodegroup` | Used by EKS to identify node groups |
| AWS | `k8s.amazonaws.com/eniConfig` | Used by the AWS CNI for custom networking |
| AWS | `lifecycle` | Used by AWS as a label for spot instances |
| AWS | `topology.ebs.csi.aws.com/zone` | Used by the AWS EBS CSI driver as a target for persistent volume node affinity |
| Azure | `topology.disk.csi.azure.com/zone` | Used as the topology key by the Azure Disk CSI driver |
| Azure | `agentpool` | Legacy label used to specify to which Azure node pool a particular node belongs |
| Azure | `kubernetes.azure.com/agentpool` | Used by AKS to identify to which node pool a particular node belongs |
| GCE | `topology.gke.io/zone` | Used to specify the zone of the node |
| IBM Cloud | `ibm-cloud.kubernetes.io/worker-id` | Used by the IBM Cloud Cloud Controller Manager to identify the node |
| IBM Cloud | `vpc-block-csi-driver-labels` | Used by the IBM Cloud CSI driver as a target for persistent volume node affinity |
| IBM Cloud | `ibm-cloud.kubernetes.io/vpc-instance-id` | Used when a VPC is in use on IBM Cloud |

6 changes: 0 additions & 6 deletions cluster-autoscaler/cloudprovider/gce/gce_cloud_provider.go
@@ -182,7 +182,6 @@ type Mig interface {
cloudprovider.NodeGroup

GceRef() GceRef
Version() string
}

type gceMig struct {
@@ -193,11 +192,6 @@ type gceMig struct {
maxSize int
}

// Version return the Mig version.
func (mig *gceMig) Version() string {
return ""
}

// GceRef returns Mig's GceRef
func (mig *gceMig) GceRef() GceRef {
return mig.gceRef
6 changes: 5 additions & 1 deletion cluster-autoscaler/cloudprovider/gce/gce_manager.go
@@ -589,7 +589,11 @@ func (m *gceManagerImpl) GetMigTemplateNode(mig Mig) (*apiv1.Node, error) {
if err != nil {
return nil, err
}
return m.templates.BuildNodeFromTemplate(mig, template, machineType.CPU, machineType.Memory, nil, m.reserved)
migOsInfo, err := m.templates.MigOsInfo(mig.Id(), template)
if err != nil {
return nil, err
}
return m.templates.BuildNodeFromTemplate(mig, migOsInfo, template, machineType.CPU, machineType.Memory, nil, m.reserved)
}

// parseMIGAutoDiscoverySpecs returns any provided NodeGroupAutoDiscoverySpecs
36 changes: 34 additions & 2 deletions cluster-autoscaler/cloudprovider/gce/gce_reserved.go
@@ -87,7 +87,10 @@ type GceReserved struct{}

// CalculateKernelReserved computes how much memory Linux kernel will reserve.
// TODO(jkaniuk): account for crashkernel reservation on RHEL / CentOS
func (r *GceReserved) CalculateKernelReserved(physicalMemory int64, os OperatingSystem, osDistribution OperatingSystemDistribution, arch SystemArchitecture, nodeVersion string) int64 {
func (r *GceReserved) CalculateKernelReserved(m MigOsInfo, physicalMemory int64) int64 {
os := m.Os()
osDistribution := m.OsDistribution()
arch := m.Arch()
switch os {
case OperatingSystemLinux:
// Account for memory reserved by kernel
@@ -267,7 +270,9 @@ func EphemeralStorageOnLocalSSDFilesystemOverheadInBytes(diskCount int64, osDist
}

// CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold
func (r *GceReserved) CalculateOSReservedEphemeralStorage(diskSize int64, os OperatingSystem, osDistribution OperatingSystemDistribution, arch SystemArchitecture, nodeVersion string) int64 {
func (r *GceReserved) CalculateOSReservedEphemeralStorage(m MigOsInfo, diskSize int64) int64 {
osDistribution := m.OsDistribution()
arch := m.Arch()
switch osDistribution {
case OperatingSystemDistributionCOS:
storage := int64(math.Ceil(0.015635*float64(diskSize))) + int64(math.Ceil(4.148*GiB)) // os partition estimation
@@ -289,3 +294,30 @@ func (r *GceReserved) CalculateOSReservedEphemeralStorage(diskSize int64, os Ope
return 0
}
}

// GceMigOsInfo contains os details of nodes in gce mig.
type GceMigOsInfo struct {
os OperatingSystem
osDistribution OperatingSystemDistribution
arch SystemArchitecture
}

// Os returns the operating system.
func (m *GceMigOsInfo) Os() OperatingSystem {
return m.os
}

// OsDistribution returns the operating system distribution.
func (m *GceMigOsInfo) OsDistribution() OperatingSystemDistribution {
return m.osDistribution
}

// Arch returns the system architecture.
func (m *GceMigOsInfo) Arch() SystemArchitecture {
return m.arch
}

// NewMigOsInfo returns the gce implementation of the MigOsInfo interface.
func NewMigOsInfo(os OperatingSystem, osDistribution OperatingSystemDistribution, arch SystemArchitecture) MigOsInfo {
return &GceMigOsInfo{os, osDistribution, arch}
}
3 changes: 2 additions & 1 deletion cluster-autoscaler/cloudprovider/gce/gce_reserved_test.go
@@ -108,7 +108,8 @@ func TestCalculateKernelReservedLinux(t *testing.T) {
for idx, tc := range testCases {
r := &GceReserved{}
t.Run(fmt.Sprintf("%v", idx), func(t *testing.T) {
reserved := r.CalculateKernelReserved(tc.physicalMemory, OperatingSystemLinux, tc.osDistribution, tc.arch, "")
m := NewMigOsInfo(OperatingSystemLinux, tc.osDistribution, tc.arch)
reserved := r.CalculateKernelReserved(m, tc.physicalMemory)
if tc.osDistribution == OperatingSystemDistributionUbuntu {
assert.Equal(t, tc.reservedMemory+int64(math.Min(correctionConstant*float64(tc.physicalMemory), maximumCorrectionValue)+ubuntuSpecificOffset), reserved)
} else if tc.osDistribution == OperatingSystemDistributionCOS {
14 changes: 12 additions & 2 deletions cluster-autoscaler/cloudprovider/gce/os_reserved.go
@@ -16,13 +16,23 @@ limitations under the License.

package gce

// MigOsInfo stores the os parameters.
type MigOsInfo interface {
// Os returns the operating system.
Os() OperatingSystem
// OsDistribution returns the operating system distribution.
OsDistribution() OperatingSystemDistribution
// Arch returns the system architecture.
Arch() SystemArchitecture
}

// OsReservedCalculator calculates the OS reserved values.
type OsReservedCalculator interface {
// CalculateKernelReserved computes how much memory OS kernel will reserve.
// NodeVersion parameter is optional. If empty string is passed a result calculated using default node version will be returned.
CalculateKernelReserved(physicalMemory int64, os OperatingSystem, osDistribution OperatingSystemDistribution, arch SystemArchitecture, nodeVersion string) int64
CalculateKernelReserved(m MigOsInfo, physicalMemory int64) int64

// CalculateOSReservedEphemeralStorage estimates how much ephemeral storage OS will reserve and eviction threshold.
// NodeVersion parameter is optional. If empty string is passed a result calculated using default node version will be returned.
CalculateOSReservedEphemeralStorage(diskSize int64, os OperatingSystem, osDistribution OperatingSystemDistribution, arch SystemArchitecture, nodeVersion string) int64
CalculateOSReservedEphemeralStorage(m MigOsInfo, diskSize int64) int64
}