Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support scale up of nodepool with zero nodes with ephemeral storage r… #6607

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 14 additions & 11 deletions cluster-autoscaler/cloudprovider/oci/common/oci_shape.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (

// ShapeGetter returns the oci shape attributes for the pool.
type ShapeGetter interface {
GetNodePoolShape(*oke.NodePool) (*Shape, error)
GetNodePoolShape(*oke.NodePool, int64) (*Shape, error)
GetInstancePoolShape(pool *core.InstancePool) (*Shape, error)
Refresh()
}
Expand Down Expand Up @@ -51,10 +51,11 @@ func (cc ShapeClientImpl) ListShapes(ctx context.Context, req core.ListShapesReq
// Shape includes the resource attributes of a given shape which should be used
// for constructing node templates.
type Shape struct {
Name string
CPU float32
GPU int
MemoryInBytes float32
Name string
CPU float32
GPU int
MemoryInBytes float32
EphemeralStorageInBytes float32
}

// CreateShapeGetter creates a new oci shape getter.
Expand All @@ -78,14 +79,15 @@ func (osf *shapeGetterImpl) Refresh() {
}

// GetNodePoolShape gets the shape by querying the node pool's configuration
func (osf *shapeGetterImpl) GetNodePoolShape(np *oke.NodePool) (*Shape, error) {
func (osf *shapeGetterImpl) GetNodePoolShape(np *oke.NodePool, ephemeralStorage int64) (*Shape, error) {
shapeName := *np.NodeShape
if np.NodeShapeConfig != nil {
return &Shape{
CPU: *np.NodeShapeConfig.Ocpus * 2,
// num_bytes * kilo * mega * giga
MemoryInBytes: *np.NodeShapeConfig.MemoryInGBs * 1024 * 1024 * 1024,
GPU: 0,
MemoryInBytes: *np.NodeShapeConfig.MemoryInGBs * 1024 * 1024 * 1024,
GPU: 0,
EphemeralStorageInBytes: float32(ephemeralStorage),
}, nil
}

Expand Down Expand Up @@ -113,9 +115,10 @@ func (osf *shapeGetterImpl) GetNodePoolShape(np *oke.NodePool) (*Shape, error) {
// Update the cache based on latest results
for _, s := range resp.Items {
osf.cache[*s.Shape] = &Shape{
CPU: getFloat32(s.Ocpus) * 2, // convert ocpu to vcpu
GPU: getInt(s.Gpus),
MemoryInBytes: getFloat32(s.MemoryInGBs) * 1024 * 1024 * 1024,
CPU: getFloat32(s.Ocpus) * 2, // convert ocpu to vcpu
GPU: getInt(s.Gpus),
MemoryInBytes: getFloat32(s.MemoryInGBs) * 1024 * 1024 * 1024,
EphemeralStorageInBytes: float32(ephemeralStorage),
}
}

Expand Down
16 changes: 9 additions & 7 deletions cluster-autoscaler/cloudprovider/oci/common/oci_shape_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,10 @@ func TestNodePoolGetShape(t *testing.T) {
"basic shape": {
shape: "VM.Standard1.2",
expected: &Shape{
CPU: 4,
MemoryInBytes: 16 * 1024 * 1024 * 1024,
GPU: 0,
CPU: 4,
MemoryInBytes: 16 * 1024 * 1024 * 1024,
GPU: 0,
EphemeralStorageInBytes: -1,
},
},
"flex shape": {
Expand All @@ -107,9 +108,10 @@ func TestNodePoolGetShape(t *testing.T) {
MemoryInGBs: common.Float32(64),
},
expected: &Shape{
CPU: 8,
MemoryInBytes: 4 * 16 * 1024 * 1024 * 1024,
GPU: 0,
CPU: 8,
MemoryInBytes: 4 * 16 * 1024 * 1024 * 1024,
GPU: 0,
EphemeralStorageInBytes: -1,
},
},
}
Expand All @@ -118,7 +120,7 @@ func TestNodePoolGetShape(t *testing.T) {
shapeGetter := CreateShapeGetter(shapeClient)

t.Run(name, func(t *testing.T) {
shape, err := shapeGetter.GetNodePoolShape(&oke.NodePool{NodeShape: &tc.shape, NodeShapeConfig: tc.shapeConfig})
shape, err := shapeGetter.GetNodePoolShape(&oke.NodePool{NodeShape: &tc.shape, NodeShapeConfig: tc.shapeConfig}, -1)
if err != nil {
t.Fatal(err)
}
Expand Down
27 changes: 27 additions & 0 deletions cluster-autoscaler/cloudprovider/oci/common/oci_tag.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
Copyright 2021-2024 Oracle and/or its affiliates.
*/

package common

import (
oke "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/oci/vendor-internal/github.com/oracle/oci-go-sdk/v65/containerengine"
)

// TagsGetter returns the oci tags for the pool.
type TagsGetter interface {
GetNodePoolFreeformTags(*oke.NodePool) (map[string]string, error)
}

// TagsGetterImpl is the implementation to fetch the oci tags for the pool.
type TagsGetterImpl struct{}

// CreateTagsGetter creates a new oci tags getter.
func CreateTagsGetter() TagsGetter {
return &TagsGetterImpl{}
}

// GetNodePoolFreeformTags returns the FreeformTags for the nodepool
func (tgi *TagsGetterImpl) GetNodePoolFreeformTags(np *oke.NodePool) (map[string]string, error) {
return np.FreeformTags, nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,7 @@ const (
// ToBeDeletedByClusterAutoscaler is the taint used to ensure that after a node has been called to be deleted
// no more pods will schedule onto it
ToBeDeletedByClusterAutoscaler = "ignore-taint.cluster-autoscaler.kubernetes.io/oke-impending-node-termination"

// EphemeralStorageSize is the freeform tag key that would be used to determine the ephemeral-storage size of the node
EphemeralStorageSize = "cluster-autoscaler/node-ephemeral-storage"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given that this is an OKE specific implementation, I'm on the fence whether we should make the name indicate that it's custom/OKE specific. 🤔

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we using free-form tags to provide local disk information? Is there something special about the scale from 0 scenario that prevents the OKE and InstancePool implementations from using the LocalDisksTotalSizeInGBs and LocalDisks fields of core.shape when populating the node template in buildNodeFromTemplate?

Copy link
Contributor

@jlamillan jlamillan Mar 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming ephemeral storage (like memory and CPU) can be determined from the shape, it is still useful to optionally support using free-form tags to override shape specific values like CPU, memory, ephemeral-storage, GPU, etc..

However, in my opinion, the values from core.shape should remain the default source resource information for a node when they are available.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK... after looking into this more, it looks like most shapes use "block storage only". This means that most of the time LocalDisksTotalSizeInGBs and LocalDisks will not be set.

I still think it's a good idea for both the node-pool and instance-pool implementations to check for and use shape.LocalDisksTotalSizeInGBs from core.shape by default. However, given that it's going to be unset most of the time, it seems like a good idea to add the ability to receive these values via free-form tags.

)
37 changes: 35 additions & 2 deletions cluster-autoscaler/cloudprovider/oci/nodepools/oci_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.N

//ociShapeGetter := ocicommon.CreateShapeGetter(computeClient)
ociShapeGetter := ocicommon.CreateShapeGetter(ocicommon.ShapeClientImpl{ComputeMgmtClient: computeMgmtClient, ComputeClient: computeClient})
ociTagsGetter := ocicommon.CreateTagsGetter()

registeredTaintsGetter := CreateRegisteredTaintsGetter()

Expand All @@ -145,6 +146,7 @@ func CreateNodePoolManager(cloudConfigPath string, discoveryOpts cloudprovider.N
computeClient: &computeClient,
staticNodePools: map[string]NodePool{},
ociShapeGetter: ociShapeGetter,
ociTagsGetter: ociTagsGetter,
registeredTaintsGetter: registeredTaintsGetter,
nodePoolCache: newNodePoolCache(&okeClient),
}
Expand Down Expand Up @@ -210,6 +212,7 @@ type ociManagerImpl struct {
okeClient okeClient
computeClient *core.ComputeClient
ociShapeGetter ocicommon.ShapeGetter
ociTagsGetter ocicommon.TagsGetter
registeredTaintsGetter RegisteredTaintsGetter
staticNodePools map[string]NodePool

Expand Down Expand Up @@ -521,7 +524,15 @@ func (m *ociManagerImpl) buildNodeFromTemplate(nodePool *oke.NodePool) (*apiv1.N
Capacity: apiv1.ResourceList{},
}

shape, err := m.ociShapeGetter.GetNodePoolShape(nodePool)
freeformTags, err := m.ociTagsGetter.GetNodePoolFreeformTags(nodePool)
if err != nil {
return nil, err
}
ephemeralStorage, err := getEphemeralResourceRequestsInBytes(freeformTags)
if err != nil {
klog.Error(err)
}
shape, err := m.ociShapeGetter.GetNodePoolShape(nodePool, ephemeralStorage)
if err != nil {
return nil, err
}
Expand All @@ -542,11 +553,16 @@ func (m *ociManagerImpl) buildNodeFromTemplate(nodePool *oke.NodePool) (*apiv1.N
})
}

if err != nil {
return nil, err
}
node.Status.Capacity[apiv1.ResourcePods] = *resource.NewQuantity(110, resource.DecimalSI)

node.Status.Capacity[apiv1.ResourceCPU] = *resource.NewQuantity(int64(shape.CPU), resource.DecimalSI)
node.Status.Capacity[apiv1.ResourceMemory] = *resource.NewQuantity(int64(shape.MemoryInBytes), resource.DecimalSI)
node.Status.Capacity[ipconsts.ResourceGPU] = *resource.NewQuantity(int64(shape.GPU), resource.DecimalSI)
if ephemeralStorage != -1 {
node.Status.Capacity[apiv1.ResourceEphemeralStorage] = *resource.NewQuantity(ephemeralStorage, resource.DecimalSI)
}

node.Status.Allocatable = node.Status.Capacity

Expand Down Expand Up @@ -634,6 +650,23 @@ func addTaintToSpec(node *apiv1.Node, taintKey string, effect apiv1.TaintEffect)
return true
}

func getEphemeralResourceRequestsInBytes(tags map[string]string) (int64, error) {
for key, value := range tags {
if key == npconsts.EphemeralStorageSize {
klog.V(4).Infof("ephemeral-storage size set with value : %v", value)
value = strings.ReplaceAll(value, " ", "")
resourceSize, err := resource.ParseQuantity(value)
if err != nil {
return -1, err
}
klog.V(4).Infof("ephemeral-storage size = %v (%v)", resourceSize.Value(), resourceSize.Format)
return resourceSize.Value(), nil
}
}
klog.V(4).Infof("ephemeral-storage size not set as part of the nodepool's freeform tags")
return -1, nil
}

// IsConflict checks if the error is a conflict
func IsConflict(err error) bool {
return ReasonForError(err) == metav1.StatusReasonConflict
Expand Down
Loading