Skip to content

Commit

Permalink
Merge pull request kubernetes#4452 from airbnb/es--grpc-expander-plugin
Browse files Browse the repository at this point in the history
Add gRPC expander plugin
  • Loading branch information
k8s-ci-robot authored and Anton Kirillov committed Oct 27, 2022
1 parent 6e22b10 commit c98a480
Show file tree
Hide file tree
Showing 351 changed files with 108,818 additions and 61,370 deletions.
4 changes: 4 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ type AutoscalingOptions struct {
EstimatorName string
// ExpanderNames sets the chain of node group expanders to be used in scale up
ExpanderNames string
// GRPCExpanderCert is the location of the cert passed to the gRPC server for TLS when using the gRPC expander
GRPCExpanderCert string
// GRPCExpanderURL is the url of the gRPC server when using the gRPC expander
GRPCExpanderURL string
// IgnoreDaemonSetsUtilization is whether CA will ignore DaemonSet pods when calculating resource utilization for scaling down
IgnoreDaemonSetsUtilization bool
// IgnoreMirrorPodsUtilization is whether CA will ignore Mirror pods when calculating resource utilization for scaling down
Expand Down
4 changes: 2 additions & 2 deletions cluster-autoscaler/core/autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ func initializeDefaultOptions(opts *AutoscalerOptions) error {
opts.CloudProvider = cloudBuilder.NewCloudProvider(opts.AutoscalingOptions)
}
if opts.ExpanderStrategy == nil {
expanderStrategy, err := factory.ExpanderStrategyFromStrings(strings.Split(opts.ExpanderNames, ","),
opts.CloudProvider, opts.AutoscalingKubeClients, opts.KubeClient, opts.ConfigNamespace)
expanderStrategy, err := factory.ExpanderStrategyFromStrings(strings.Split(opts.ExpanderNames, ","), opts.CloudProvider,
opts.AutoscalingKubeClients, opts.KubeClient, opts.ConfigNamespace, opts.GRPCExpanderCert, opts.GRPCExpanderURL)
if err != nil {
return err
}
Expand Down
4 changes: 3 additions & 1 deletion cluster-autoscaler/expander/expander.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import (

var (
// AvailableExpanders is a list of available expander options
AvailableExpanders = []string{RandomExpanderName, MostPodsExpanderName, LeastWasteExpanderName, PriceBasedExpanderName, PriorityBasedExpanderName}
AvailableExpanders = []string{RandomExpanderName, MostPodsExpanderName, LeastWasteExpanderName, PriceBasedExpanderName, PriorityBasedExpanderName, GRPCExpanderName}
// RandomExpanderName selects a node group at random
RandomExpanderName = "random"
// MostPodsExpanderName selects a node group that fits the most pods
Expand All @@ -36,6 +36,8 @@ var (
PriceBasedExpanderName = "price"
// PriorityBasedExpanderName selects a node group based on a user-configured priorities assigned to group names
PriorityBasedExpanderName = "priority"
// GRPCExpanderName uses the gRPC client expander to call to an external gRPC server to select a node group for scale up
GRPCExpanderName = "grpc"
)

// Option describes an option to expand the cluster.
Expand Down
7 changes: 5 additions & 2 deletions cluster-autoscaler/expander/factory/expander_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,22 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/expander"
"k8s.io/autoscaler/cluster-autoscaler/expander/grpcplugin"
"k8s.io/autoscaler/cluster-autoscaler/expander/mostpods"
"k8s.io/autoscaler/cluster-autoscaler/expander/price"
"k8s.io/autoscaler/cluster-autoscaler/expander/priority"
"k8s.io/autoscaler/cluster-autoscaler/expander/random"
"k8s.io/autoscaler/cluster-autoscaler/expander/waste"
"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
"k8s.io/autoscaler/cluster-autoscaler/utils/kubernetes"

kube_client "k8s.io/client-go/kubernetes"
)

// ExpanderStrategyFromStrings creates an expander.Strategy according to the names of the expanders passed in
// take in whole opts and access stuff here
func ExpanderStrategyFromStrings(expanderFlags []string, cloudProvider cloudprovider.CloudProvider,
autoscalingKubeClients *context.AutoscalingKubeClients, kubeClient kube_client.Interface,
configNamespace string) (expander.Strategy, errors.AutoscalerError) {
configNamespace string, GRPCExpanderCert string, GRPCExpanderURL string) (expander.Strategy, errors.AutoscalerError) {
var filters []expander.Filter
seenExpanders := map[string]struct{}{}
strategySeen := false
Expand Down Expand Up @@ -67,6 +68,8 @@ func ExpanderStrategyFromStrings(expanderFlags []string, cloudProvider cloudprov
stopChannel := make(chan struct{})
lister := kubernetes.NewConfigMapListerForNamespace(kubeClient, stopChannel, configNamespace)
filters = append(filters, priority.NewFilter(lister.ConfigMaps(configNamespace), autoscalingKubeClients.Recorder))
case expander.GRPCExpanderName:
filters = append(filters, grpcplugin.NewFilter(GRPCExpanderCert, GRPCExpanderURL))
default:
return nil, errors.NewAutoscalerError(errors.InternalError, "Expander %s not supported", expanderFlag)
}
Expand Down
41 changes: 41 additions & 0 deletions cluster-autoscaler/expander/grpcplugin/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# gRPC Expander for Cluster Autoscaler

## Introduction
This expander functions as a gRPC client, and will pass expansion options to an external gRPC server.
The external server will use this information to make a decision on which Node Group to expand, and return an option to expand.

## Motivation

This expander gives users very fine grained control over which option they'd like to expand.
The gRPC server must be implemented by the user, but the logic can be developed out of band with Cluster Autoscaler.
There are a wide variety of use cases here. Some examples are as follows:
* A tiered weighted random strategy can be implemented, instead of a static priority ladder offered by the priority expander.
* A strategy to encapsulate business logic specific to a user but not all users of Cluster Autoscaler
* A strategy to take into account the dynamic fluctuating prices of the spot instance market

## Configuration options
As using this expander requires communication with another service, users must specify a few options as CLI arguments.

```yaml
--grpcExpanderUrl
```
URL of the gRPC Expander server, for CA to communicate with.
```yaml
--grpcExpanderCert
```
Location of the volume mounted certificate of the gRPC server if it is configured to communicate over TLS

## gRPC Expander Server Setup
The gRPC server can be set up in many ways, but a simple example is described below.
An example of a barebones gRPC Expander Server can be found in the `example` directory under the `fake_grpc_server.go` file. This is meant to be copied elsewhere and deployed as a separate
service. Note that the `protos/expander.pb.go` generated protobuf code will also need to be copied and used to serialize/deserialize the Options passed from CA.
Communication between Cluster Autoscaler and the gRPC Server will occur over native kube-proxy. To use this, note the Service and Namespace the gRPC server is deployed in.

Deploy the gRPC Expander Server as a separate app, listening on a specific port number.
Start Cluster Autoscaler with the `--grpcExpanderUrl=SERVICE_NAME.NAMESPACE_NAME.svc.cluster.local:PORT_NUMBER` flag, as well as `--grpcExpanderCert` pointed at the location of the volume mounted certificate of the gRPC server.

## Details

The gRPC client currently transforms nodeInfo objects passed into the expander to v1.Node objects to save rpc call throughput. As such, the gRPC server will not have access to daemonsets and static pods running on each node.


104 changes: 104 additions & 0 deletions cluster-autoscaler/expander/grpcplugin/example/fake_grpc_server.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package example

import (
"context"
"fmt"
"log"
"net"

"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
"k8s.io/autoscaler/cluster-autoscaler/expander/grpcplugin/protos"
)

// This code is meant to be used as starter code, deployed as a separate app, not in Cluster Autoscaler.
// This serves as the gRPC Expander Server counterpart to the client which lives in this repo
// main.go of said application should simply pass in paths to (optional)cert, (optional)private key, and port, and call Serve to start listening
// copy the protos/expander.pb.go to your other application's repo, so it has access to the protobuf definitions

// Serve builds the gRPC Expander Server, registers the example expander
// implementation on it, and then blocks serving requests on the given port.
// It is meant to be called from main() of the standalone Expander Server app.
//
// TLS is enabled only when both certPath and keyPath are non-empty; otherwise
// the server is created without transport credentials. A failure to load the
// credentials or to serve is logged with log.Fatal/log.Fatalf.
func Serve(certPath string, keyPath string, port uint) {
	var serverOpts []grpc.ServerOption

	// If credentials are passed in, use them
	useTLS := certPath != "" && keyPath != ""
	if useTLS {
		log.Printf("Using certFile: %v and keyFile: %v", certPath, keyPath)
		creds, loadErr := credentials.NewServerTLSFromFile(certPath, keyPath)
		if loadErr != nil {
			log.Fatal("cannot load TLS credentials: ", loadErr)
		}
		serverOpts = append(serverOpts, grpc.Creds(creds))
	}
	server := grpc.NewServer(serverOpts...)

	listener := getNetListener(port)
	protos.RegisterExpanderServer(server, NewExpanderServerImpl())

	// start the server
	log.Println("Starting server on port ", port)
	serveErr := server.Serve(listener)
	if serveErr != nil {
		log.Fatalf("failed to serve: %s", serveErr)
	}
}

// getNetListener opens a TCP listener on the given port (no host is specified
// in the listen address). If the listener cannot be created, the failure is
// logged and the process exits via log.Fatalf, so callers always receive a
// non-nil listener.
func getNetListener(port uint) net.Listener {
	lis, err := net.Listen("tcp", fmt.Sprintf(":%d", port))
	if err != nil {
		// log.Fatalf logs and then calls os.Exit(1); no statement after it
		// would ever run, so no additional panic is needed here.
		log.Fatalf("failed to listen: %v", err)
	}

	return lis
}

// ExpanderServerImpl is an implementation of Expander Server from proto definition.
// It has no fields; its behavior is provided by its BestOptions method.
type ExpanderServerImpl struct{}

// NewExpanderServerImpl returns a pointer to a newly allocated ExpanderServerImpl.
func NewExpanderServerImpl() *ExpanderServerImpl {
	return new(ExpanderServerImpl)
}

// BestOptions method filters out the best options of all options passed from the gRPC Client in CA, according to the defined strategy.
//
// The example strategy returns the single Option whose NodeGroupId is the
// longest; on a tie the earliest such Option in the request wins. nil entries
// in the request are ignored. When the request carries no usable Option, an
// empty response (no Options) is returned instead of dereferencing a nil
// choice. The returned error is always nil.
func (s *ExpanderServerImpl) BestOptions(ctx context.Context, req *protos.BestOptionsRequest) (*protos.BestOptionsResponse, error) {
	opts := req.GetOptions()
	log.Printf("Received BestOption Request with %v options", len(opts))

	// This strategy simply chooses the Option with the longest NodeGroupID name, but can be replaced with any arbitrary logic
	longest := 0
	var choice *protos.Option
	for _, opt := range opts {
		if opt == nil {
			continue
		}
		log.Println(opt.NodeGroupId)
		// choice == nil makes sure an Option is still picked when every
		// NodeGroupId is empty; longest must track the current best so that
		// later, shorter IDs do not replace it.
		if choice == nil || len(opt.NodeGroupId) > longest {
			choice = opt
			longest = len(opt.NodeGroupId)
		}
	}

	if choice == nil {
		log.Print("no options to choose from, returning empty bestOptions")
		return &protos.BestOptionsResponse{}, nil
	}

	log.Print("returned bestOptions with option: ", choice.NodeGroupId)

	// Return just one option for now
	return &protos.BestOptionsResponse{
		Options: []*protos.Option{choice},
	}, nil
}
30 changes: 30 additions & 0 deletions cluster-autoscaler/expander/grpcplugin/example/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package example

import "flag"

// main registers and parses the command-line flags of the example gRPC
// Expander Server (cert-path, key-path, port) and passes their values to Serve.
//
// NOTE(review): this file is declared as package example, so as written this
// function is not a program entry point; presumably it is meant to be copied
// into a package main of the separate server app — confirm.
func main() {
	var (
		certPath string
		keyPath  string
		port     uint
	)

	flag.StringVar(&certPath, "cert-path", "", "Path to cert file for gRPC Expander Server")
	flag.StringVar(&keyPath, "key-path", "", "Path to private key for gRPC Expander Server")
	flag.UintVar(&port, "port", 7000, "Port number for server to listen on")
	flag.Parse()

	Serve(certPath, keyPath, port)
}
143 changes: 143 additions & 0 deletions cluster-autoscaler/expander/grpcplugin/grpc_client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package grpcplugin

import (
"context"
"log"
"time"

v1 "k8s.io/api/core/v1"
"k8s.io/autoscaler/cluster-autoscaler/expander"
"k8s.io/autoscaler/cluster-autoscaler/expander/grpcplugin/protos"
"k8s.io/klog/v2"
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"

"google.golang.org/grpc"
"google.golang.org/grpc/credentials"
)

// gRPCTimeout is the deadline applied to each BestOptions call made to the gRPC expander server.
const gRPCTimeout = 5 * time.Second

// grpcclientstrategy is the expander filter that delegates the choice of
// expansion options to a gRPC expander server.
type grpcclientstrategy struct {
	// grpcClient is the client used to call the server. BestOptions treats a
	// nil client as a misconfiguration and returns its input unfiltered.
	grpcClient protos.ExpanderClient
}

// NewFilter builds the gRPC-backed expansion filter. It creates a gRPC client
// for the server at expanderUrl using the certificate at expanderCert and
// stores it in the returned filter; the client is stored as-is even when it
// is nil.
func NewFilter(expanderCert string, expanderUrl string) expander.Filter {
	strategy := &grpcclientstrategy{}
	strategy.grpcClient = createGRPCClient(expanderCert, expanderUrl)
	return strategy
}

// createGRPCClient dials the gRPC expander server at expanderUrl over TLS,
// using the certificate file at expanderCert, and returns an Expander client
// bound to that connection.
//
// An empty expanderCert, a failure to load the TLS credentials, or a dial
// error is reported through log.Fatalf; the nil returns that follow those
// calls only matter if log.Fatalf were to return.
func createGRPCClient(expanderCert string, expanderUrl string) protos.ExpanderClient {
	// Insecure connections are refused: a certificate path is mandatory.
	if len(expanderCert) == 0 {
		log.Fatalf("GRPC Expander Cert not specified, insecure connections not allowed")
		return nil
	}

	tlsCreds, credErr := credentials.NewClientTLSFromFile(expanderCert, "")
	if credErr != nil {
		log.Fatalf("Failed to create TLS credentials %v", credErr)
		return nil
	}

	transportOpt := grpc.WithTransportCredentials(tlsCreds)
	klog.V(2).Infof("Dialing: %s with dialopt: %v", expanderUrl, transportOpt)

	clientConn, dialErr := grpc.Dial(expanderUrl, transportOpt)
	if dialErr != nil {
		log.Fatalf("Fail to dial server: %v", dialErr)
		return nil
	}

	return protos.NewExpanderClient(clientConn)
}

// BestOptions sends the candidate expansion options and the node map to the
// gRPC expander server and returns the options the server selected, mapped
// back to the original expander.Option values by node group ID.
//
// The call is bounded by gRPCTimeout. The filter never fails: when the client
// is nil, the call returns an error, the response is empty, or none of the
// returned options can be mapped back, the input expansionOptions are
// returned unchanged.
func (g *grpcclientstrategy) BestOptions(expansionOptions []expander.Option, nodeInfo map[string]*schedulerframework.NodeInfo) []expander.Option {
	if g.grpcClient == nil {
		klog.Errorf("Incorrect gRPC client config, filtering no options")
		return expansionOptions
	}

	// Transform inputs to gRPC inputs
	grpcOptionsSlice, nodeGroupIDOptionMap := populateOptionsForGRPC(expansionOptions)
	grpcNodeMap := populateNodeInfoForGRPC(nodeInfo)

	// call gRPC server to get BestOption
	klog.V(2).Infof("GPRC call of best options to server with %v options", len(nodeGroupIDOptionMap))
	ctx, cancel := context.WithTimeout(context.Background(), gRPCTimeout)
	defer cancel()
	bestOptionsResponse, err := g.grpcClient.BestOptions(ctx, &protos.BestOptionsRequest{Options: grpcOptionsSlice, NodeMap: grpcNodeMap})
	if err != nil {
		// The error is not necessarily a timeout (it can be any RPC failure),
		// so report the actual error instead of assuming one.
		klog.Errorf("GRPC call failed, no options filtered: %v", err)
		return expansionOptions
	}

	if bestOptionsResponse == nil || bestOptionsResponse.Options == nil {
		klog.V(4).Info("GRPC returned nil bestOptions, no options filtered")
		return expansionOptions
	}
	// Transform back options slice
	options := transformAndSanitizeOptionsFromGRPC(bestOptionsResponse.Options, nodeGroupIDOptionMap)
	if len(options) == 0 {
		klog.V(4).Info("Unable to sanitize GPRC returned bestOptions, no options filtered")
		return expansionOptions
	}
	return options
}

// populateOptionsForGRPC creates a map of nodegroup ID and options, as well as a slice of Options objects for the gRPC call.
//
// The slice holds one message per expansion option, in input order. The map
// is keyed by node group ID, so if two options share an ID the later one
// overwrites the earlier one in the map. The returned slice is never nil.
func populateOptionsForGRPC(expansionOptions []expander.Option) ([]*protos.Option, map[string]expander.Option) {
	// Both containers grow to at most len(expansionOptions); size them once.
	grpcOptionsSlice := make([]*protos.Option, 0, len(expansionOptions))
	nodeGroupIDOptionMap := make(map[string]expander.Option, len(expansionOptions))
	for _, option := range expansionOptions {
		// Resolve the ID once and reuse it for both the map key and the message.
		nodeGroupID := option.NodeGroup.Id()
		nodeGroupIDOptionMap[nodeGroupID] = option
		grpcOptionsSlice = append(grpcOptionsSlice, newOptionMessage(nodeGroupID, int32(option.NodeCount), option.Debug, option.Pods))
	}
	return grpcOptionsSlice, nodeGroupIDOptionMap
}

// populateNodeInfoForGRPC builds the node map sent over gRPC: for every entry
// of nodeInfos it stores, under the same key, the v1.Node returned by that
// NodeInfo's Node() method.
func populateNodeInfoForGRPC(nodeInfos map[string]*schedulerframework.NodeInfo) map[string]*v1.Node {
	nodesByID := map[string]*v1.Node{}
	for id := range nodeInfos {
		nodesByID[id] = nodeInfos[id].Node()
	}
	return nodesByID
}

// transformAndSanitizeOptionsFromGRPC maps the options returned by the gRPC
// server back to the original expander.Option values, using
// nodeGroupIDOptionMap keyed by node group ID.
//
// nil entries and entries whose NodeGroupId is not in the map are logged and
// skipped. The result preserves the order of the server response and is nil
// when no returned option could be mapped.
func transformAndSanitizeOptionsFromGRPC(bestOptionsResponseOptions []*protos.Option, nodeGroupIDOptionMap map[string]expander.Option) []expander.Option {
	var options []expander.Option
	for _, option := range bestOptionsResponseOptions {
		if option == nil {
			klog.Errorf("GRPC server returned nil Option")
			continue
		}
		original, ok := nodeGroupIDOptionMap[option.NodeGroupId]
		if !ok {
			// The format verb is required: without it the ID is rendered as
			// a "%!(EXTRA ...)" suffix instead of being part of the message.
			klog.Errorf("GRPC server returned invalid nodeGroup ID: %s", option.NodeGroupId)
			continue
		}
		options = append(options, original)
	}
	return options
}

// newOptionMessage builds the protobuf Option message for one expansion
// option from its node group ID, node count, debug string and pods.
func newOptionMessage(nodeGroupId string, nodeCount int32, debug string, pods []*v1.Pod) *protos.Option {
	msg := new(protos.Option)
	msg.NodeGroupId = nodeGroupId
	msg.NodeCount = nodeCount
	msg.Debug = debug
	msg.Pod = pods
	return msg
}
Loading

0 comments on commit c98a480

Please sign in to comment.