Skip to content

Commit

Permalink
Optional Feature - Skip time-consuming findInstanceByDetails API call…
Browse files Browse the repository at this point in the history
… by adding non_pool_member annotation to the node.

This feature can be turned on by setting environment variable OCI_USE_NON_POOL_MEMBER_ANNOTATION to true.

For example, if a node does not belong to any instance pool,
during the very first GetInstancePoolForInstance call, the node will be annotated with
    oci.oraclecloud.com/instancepool-id: non_pool_member
After new annotation is added to the node, all future GetInstancePoolForInstance calls related to
the non-pool-member node will skip the time-consuming findInstanceByDetails API call.

Minor refactoring - when retrieving pool size from instancePoolCache object, use the pool size defined inside the poolCache map instead.
  • Loading branch information
aaron.tam committed May 6, 2022
1 parent c550b77 commit 4a8076d
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 26 deletions.
5 changes: 3 additions & 2 deletions cluster-autoscaler/cloudprovider/oci/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,9 @@ Configuration via environment-variables:
- `OCI_USE_INSTANCE_PRINCIPAL` - Whether to use Instance Principals for authentication rather than expecting an OCI config file to be mounted in the container. Defaults to false.
- `OCI_REGION` - **Required** when using Instance Principals. e.g. `OCI_REGION=us-phoenix-1`. See [region list](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/regions.htm) for identifiers.
- `OCI_COMPARTMENT_ID` - Restrict the cluster-autoscaler to instance-pools in a single compartment. When unset, the cluster-autoscaler will manage each specified instance-pool no matter which compartment they are in.
- `OCI_REFRESH_INTERVAL` - Optional refresh interval to sync internal cache with OCI API defaults to `2m`.
- `OCI_COMPARTMENT_ID` - Restrict the cluster-autoscaler to instance-pools in a single compartment. When unset, the cluster-autoscaler will manage each specified instance-pool no matter which compartment they are in.
- `OCI_REFRESH_INTERVAL` - Optional refresh interval to sync internal cache with OCI API defaults to `2m`.
- `OCI_USE_NON_POOL_MEMBER_ANNOTATION` - Optional. If true, the node will be annotated as non-pool-member if it doesn't belong to any instance pool and the time-consuming instance pool lookup will be skipped.
## Deployment
Expand Down
20 changes: 11 additions & 9 deletions cluster-autoscaler/cloudprovider/oci/oci_cloud_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,12 @@ import (
)

const (
ociUseInstancePrincipalEnvVar = "OCI_USE_INSTANCE_PRINCIPAL"
ociCompartmentEnvVar = "OCI_COMPARTMENT_ID"
ociRegionEnvVar = "OCI_REGION"
ociRefreshInterval = "OCI_REFRESH_INTERVAL"
ociAnnotationCompartmentID = "oci.oraclecloud.com/compartment-id"
ociUseInstancePrincipalEnvVar = "OCI_USE_INSTANCE_PRINCIPAL"
ociUseNonPoolMemberAnnotationEnvVar = "OCI_USE_NON_POOL_MEMBER_ANNOTATION"
ociCompartmentEnvVar = "OCI_COMPARTMENT_ID"
ociRegionEnvVar = "OCI_REGION"
ociRefreshInterval = "OCI_REFRESH_INTERVAL"
ociAnnotationCompartmentID = "oci.oraclecloud.com/compartment-id"
// ResourceGPU is the GPU resource type
ResourceGPU apiv1.ResourceName = "nvidia.com/gpu"
defaultRefreshInterval = 5 * time.Minute
Expand All @@ -51,10 +52,11 @@ type OciCloudProvider struct {
// CloudConfig holds the cloud config for OCI provider.
type CloudConfig struct {
	Global struct {
		// RefreshInterval is how often the internal cache is synced with the OCI API (OCI_REFRESH_INTERVAL).
		RefreshInterval time.Duration `gcfg:"refresh-interval"`
		// CompartmentID restricts the autoscaler to instance-pools in a single compartment (OCI_COMPARTMENT_ID).
		CompartmentID string `gcfg:"compartment-id"`
		// Region is required when using Instance Principals (OCI_REGION).
		Region string `gcfg:"region"`
		// UseInstancePrinciples selects Instance Principal auth instead of a mounted OCI config file.
		UseInstancePrinciples bool `gcfg:"use-instance-principals"`
		// UseNonMemberAnnotation, when true, annotates non-pool-member nodes so the
		// expensive findInstanceByDetails lookup is skipped on subsequent calls
		// (OCI_USE_NON_POOL_MEMBER_ANNOTATION).
		// NOTE(review): tag was misspelled "use-non-memeber-annotation" — fixed before
		// this new option ships, so no existing config files can depend on the typo.
		UseNonMemberAnnotation bool `gcfg:"use-non-member-annotation"`
	}
}

Expand Down
3 changes: 3 additions & 0 deletions cluster-autoscaler/cloudprovider/oci/oci_instance_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ const (
instanceIDLabelSuffix = "instance-id_suffix"
ociInstancePoolIDAnnotation = "oci.oraclecloud.com/instancepool-id"
ociInstancePoolResourceIdent = "instancepool"

// Overload ociInstancePoolIDAnnotation to indicate a kubernetes node doesn't belong to any OCI Instance Pool.
ociInstancePoolIDNonPoolMember = "non_pool_member"
)

// InstancePoolNodeGroup implements the NodeGroup interface using OCI instance pools.
Expand Down
17 changes: 5 additions & 12 deletions cluster-autoscaler/cloudprovider/oci/oci_instance_pool_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ type instancePoolCache struct {
mu sync.Mutex
poolCache map[string]*core.InstancePool
instanceSummaryCache map[string]*[]core.InstanceSummary
targetSize map[string]int
unownedInstances map[OciRef]bool

computeManagementClient ComputeMgmtClient
Expand All @@ -64,7 +63,6 @@ func newInstancePoolCache(computeManagementClient ComputeMgmtClient, computeClie
return &instancePoolCache{
poolCache: map[string]*core.InstancePool{},
instanceSummaryCache: map[string]*[]core.InstanceSummary{},
targetSize: map[string]int{},
unownedInstances: map[OciRef]bool{},
computeManagementClient: computeManagementClient,
computeClient: computeClient,
Expand Down Expand Up @@ -145,7 +143,7 @@ func (c *instancePoolCache) removeInstance(instancePool InstancePoolNodeGroup, i
if err == nil {
c.mu.Lock()
// Decrease pool size in cache since IsDecrementSize was true
c.targetSize[instancePool.Id()] -= 1
c.poolCache[instancePool.Id()].Size = common.Int(*c.poolCache[instancePool.Id()].Size - 1)
c.mu.Unlock()
return true
}
Expand All @@ -171,11 +169,6 @@ func (c *instancePoolCache) findInstanceByDetails(ociInstance OciRef) (*OciRef,

// Look for the instance in each of the specified pool(s)
for _, nextInstancePool := range c.poolCache {
// Skip searching instance pool if it's instance count is 0.
if *nextInstancePool.Size == 0 {
klog.V(4).Infof("skipping over instance pool %s since it is empty", *nextInstancePool.Id)
continue
}
// Skip searching instance pool if we happen to know (prior labels) the pool ID and this is not it
if (ociInstance.PoolID != "") && (ociInstance.PoolID != *nextInstancePool.Id) {
klog.V(5).Infof("skipping over instance pool %s since it is not the one we are looking for", *nextInstancePool.Id)
Expand Down Expand Up @@ -289,7 +282,7 @@ func (c *instancePoolCache) setInstancePool(np *core.InstancePool) {
defer c.mu.Unlock()

c.poolCache[*np.Id] = np
c.targetSize[*np.Id] = *np.Size
c.poolCache[*np.Id].Size = np.Size
}

func (c *instancePoolCache) getInstanceSummaries(poolID string) (*[]core.InstanceSummary, error) {
Expand Down Expand Up @@ -362,7 +355,7 @@ func (c *instancePoolCache) setSize(instancePoolID string, size int) error {
}

c.mu.Lock()
c.targetSize[instancePoolID] = size
c.poolCache[instancePoolID].Size = common.Int(size)
c.mu.Unlock()

return nil
Expand Down Expand Up @@ -508,12 +501,12 @@ func (c *instancePoolCache) getSize(id string) (int, error) {
c.mu.Lock()
defer c.mu.Unlock()

size, ok := c.targetSize[id]
pool, ok := c.poolCache[id]
if !ok {
return -1, errors.New("target size not found")
}

return size, nil
return *pool.Size, nil
}

// maxScalingWaitTime estimates the maximum amount of time, as a duration, that to scale size instances.
Expand Down
18 changes: 16 additions & 2 deletions cluster-autoscaler/cloudprovider/oci/oci_instance_pool_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ type InstancePoolManagerImpl struct {
// caches the instance pool and instance summary objects received from OCI.
// All interactions with OCI's API should go through the poolCache.
instancePoolCache *instancePoolCache
kubeClient kubernetes.Interface
}

// CreateInstancePoolManager constructs the InstancePoolManager object.
Expand Down Expand Up @@ -119,6 +120,9 @@ func CreateInstancePoolManager(cloudConfigPath string, discoveryOpts cloudprovid
cloudConfig.Global.RefreshInterval = defaultRefreshInterval
}
}
if os.Getenv(ociUseNonPoolMemberAnnotationEnvVar) == "true" {
cloudConfig.Global.UseNonMemberAnnotation = true
}

clientConfig := common.CustomClientConfiguration{
RetryPolicy: newRetryPolicy(),
Expand Down Expand Up @@ -175,6 +179,7 @@ func CreateInstancePoolManager(cloudConfigPath string, discoveryOpts cloudprovid
staticInstancePools: map[string]*InstancePoolNodeGroup{},
shapeGetter: createShapeGetter(ShapeClientImpl{computeMgmtClient: computeMgmtClient, computeClient: computeClient}),
instancePoolCache: newInstancePoolCache(&computeMgmtClient, &computeClient, &networkClient),
kubeClient: kubeClient,
}

// Contains all the specs from the args that give us the pools.
Expand Down Expand Up @@ -324,6 +329,12 @@ func (m *InstancePoolManagerImpl) GetInstancePoolNodes(ip InstancePoolNodeGroup)
// GetInstancePoolForInstance returns InstancePool to which the given instance belongs. If
// PoolID is not set on the specified OciRef, we will look for a match.
func (m *InstancePoolManagerImpl) GetInstancePoolForInstance(instanceDetails OciRef) (*InstancePoolNodeGroup, error) {
if m.cfg.Global.UseNonMemberAnnotation && instanceDetails.PoolID == ociInstancePoolIDNonPoolMember {
// Instance is not part of a configured pool. Return early and avoid additional API calls.
klog.V(4).Infof(instanceDetails.Name + " is not a member of any of the specified instance pool(s) and already annotated as " +
ociInstancePoolIDNonPoolMember)
return nil, errInstanceInstancePoolNotFound
}

if instanceDetails.CompartmentID == "" {
// cfg.Global.CompartmentID would be set to tenancy OCID at runtime if compartment was not set.
Expand All @@ -336,17 +347,20 @@ func (m *InstancePoolManagerImpl) GetInstancePoolForInstance(instanceDetails Oci
return m.staticInstancePools[instanceDetails.PoolID], nil
}

kubeClient := m.kubeClient

// Details are missing from this instance.
// Try to resolve them, though it may not be a member of an instance-pool we manage.
resolvedInstanceDetails, err := m.instancePoolCache.findInstanceByDetails(instanceDetails)
if err != nil {
if m.cfg.Global.UseNonMemberAnnotation && err == errInstanceInstancePoolNotFound {
_ = annotateNode(kubeClient, instanceDetails.Name, ociInstancePoolIDAnnotation, ociInstancePoolIDNonPoolMember)
}
return nil, err
} else if resolvedInstanceDetails == nil {
return nil, nil
}

kubeClient := m.staticInstancePools[resolvedInstanceDetails.PoolID].kubeClient

// Optionally annotate & label the node so that it does not need to be searched for in subsequent iterations.
_ = annotateNode(kubeClient, resolvedInstanceDetails.Name, ociInstanceIDAnnotation, resolvedInstanceDetails.InstanceID)
_ = annotateNode(kubeClient, resolvedInstanceDetails.Name, ociInstancePoolIDAnnotation, resolvedInstanceDetails.PoolID)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ func TestInstancePoolFromArgs(t *testing.T) {
func TestGetSetInstancePoolSize(t *testing.T) {

nodePoolCache := newInstancePoolCache(computeManagementClient, computeClient, virtualNetworkClient)
nodePoolCache.targetSize["ocid1.instancepool.oc1.phx.aaaaaaaai"] = 2
nodePoolCache.poolCache["ocid1.instancepool.oc1.phx.aaaaaaaai"] = &core.InstancePool{Size: common.Int(2)}

manager := &InstancePoolManagerImpl{instancePoolCache: nodePoolCache}
size, err := manager.GetInstancePoolSize(InstancePoolNodeGroup{id: "ocid1.instancepool.oc1.phx.aaaaaaaai"})
Expand Down

0 comments on commit 4a8076d

Please sign in to comment.