Skip to content

Commit

Permalink
Merge pull request #276 from kerthcet/document/api-reference
Browse files Browse the repository at this point in the history
Add recommendedConfigs to backendRuntime
  • Loading branch information
InftyAI-Agent authored Feb 18, 2025
2 parents d5ec014 + a719916 commit f4541b0
Show file tree
Hide file tree
Showing 50 changed files with 1,324 additions and 1,410 deletions.
61 changes: 27 additions & 34 deletions api/inference/v1alpha1/backendruntime_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,10 @@ package v1alpha1
import (
autoscalingv2 "k8s.io/api/autoscaling/v2"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// BackendRuntimeArg defines a named preset of command-line arguments for a backendRuntime,
// provided for ease of use. Three preset names are provided: default, speculative-decoding
// and model-parallelism; do not change these names.
type BackendRuntimeArg struct {
// Name represents the identifier of the backendRuntime argument preset.
// Defaults to "default" when unset.
// +kubebuilder:default=default
// +optional
Name *string `json:"name,omitempty"`
// Flags represents all the preset command-line flags.
// A flag wrapped with {{ .CONFIG }} is a template placeholder awaiting render.
Flags []string `json:"flags,omitempty"`
}

// HPATrigger represents the configuration of the HorizontalPodAutoscaler.
// Inspired by kubernetes.io/pkg/apis/autoscaling/types.go#HorizontalPodAutoscalerSpec.
// Note: HPA component should be installed in prior.
Expand All @@ -55,17 +43,6 @@ type HPATrigger struct {
Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"`
}

// NamedScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time; the name is used to identify
// the trigger in the backendRuntime.
type NamedScaleTrigger struct {
// Name represents the identifier of the scale trigger, e.g. some triggers are defined
// for latency-sensitive workloads, others for throughput-sensitive workloads.
Name string `json:"name,omitempty"`
// HPA represents the trigger configuration of the HorizontalPodAutoscaler.
HPA *HPATrigger `json:"hpa,omitempty"`
}

// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time, mostly used in Playground.
type ScaleTrigger struct {
Expand All @@ -83,6 +60,30 @@ type MultiHostCommands struct {
Worker []string `json:"worker,omitempty"`
}

// RecommendedConfig represents the recommended configurations for the backendRuntime;
// users can choose one of them to apply.
type RecommendedConfig struct {
// Name represents the identifier of the config.
Name string `json:"name"`
// Args represents all the arguments for the command.
// An argument wrapped with {{ .CONFIG }} is a template placeholder awaiting render.
// +optional
Args []string `json:"args,omitempty"`
// Resources represents the resource requirements for the backend, like cpu/mem;
// accelerators like GPU should not be defined here but in the model flavors,
// or the values here will be overwritten.
// +optional
Resources *ResourceRequirements `json:"resources,omitempty"`
// SharedMemorySize represents the size of /dev/shm required in the runtime of
// the inference workload.
// +optional
SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}

// BackendRuntimeSpec defines the desired state of BackendRuntime
type BackendRuntimeSpec struct {
// Commands represents the default commands for the backendRuntime.
Expand All @@ -98,16 +99,9 @@ type BackendRuntimeSpec struct {
// Version represents the default version of the backendRuntime.
// It will be appended to the image as a tag.
Version string `json:"version"`
// Args represents the preset arguments of the backendRuntime.
// They can be appended or overwritten by the Playground backendRuntimeConfig.
Args []BackendRuntimeArg `json:"args,omitempty"`
// Envs represents the environments set to the container.
// +optional
Envs []corev1.EnvVar `json:"envs,omitempty"`
// Resources represents the resource requirements for backendRuntime, like cpu/mem,
// accelerators like GPU should not be defined here, but at the model flavors,
// or the values here will be overwritten.
Resources ResourceRequirements `json:"resources"`
// Periodic probe of backend liveness.
// Backend will be restarted if the probe fails.
// Cannot be updated.
Expand All @@ -124,10 +118,9 @@ type BackendRuntimeSpec struct {
// when it might take a long time to load data or warm a cache, than during steady-state operation.
// +optional
StartupProbe *corev1.Probe `json:"startupProbe,omitempty"`
// ScaleTriggers represents a set of triggers preset to be used by Playground.
// If Playground not specify the scale trigger, the 0-index trigger will be used.
// RecommendedConfigs represents the recommended configurations for the backendRuntime.
// +optional
ScaleTriggers []NamedScaleTrigger `json:"scaleTriggers,omitempty"`
RecommendedConfigs []RecommendedConfig `json:"recommendedConfigs,omitempty"`
}

// BackendRuntimeStatus defines the observed state of BackendRuntime
Expand Down
56 changes: 20 additions & 36 deletions api/inference/v1alpha1/config_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,43 @@ const (
)

// NOTE(review): this span is a rendered diff hunk — it still contains both the removed
// fields (Name, the trailing Args) and the added ones (BackendName, ConfigName, the
// leading Args); confirm against the applied file before relying on the exact field set.
type BackendRuntimeConfig struct {
// Name represents the inference backend under the hood, e.g. vLLM.
// BackendName represents the inference backend under the hood, e.g. vLLM.
// +kubebuilder:default=vllm
// +optional
Name *BackendName `json:"name,omitempty"`
BackendName *BackendName `json:"backendName,omitempty"`
// Version represents the backend version if you want a different one
// from the default version.
// +optional
Version *string `json:"version,omitempty"`
// Envs represents the environments set to the container.
// +optional
Envs []corev1.EnvVar `json:"envs,omitempty"`

// ConfigName represents the recommended configuration name for the backend.
// It will be inferred from the models in the runtime if not specified, e.g. default,
// speculative-decoding or model-parallelism.
ConfigName *string `json:"configName,omitempty"`
// Args represents all the arguments for the command.
// An argument wrapped with {{ .CONFIG }} is a template placeholder awaiting render.
// +optional
// Args defined here will "append" to the args in the recommendedConfig.
// +optional
Args []string `json:"args,omitempty"`
// Resources represents the resource requirements for the backend, like cpu/mem;
// accelerators like GPU should not be defined here but in the model flavors,
// or the values here will be overwritten.
// Resources defined here will "overwrite" the resources in the recommendedConfig.
// +optional
Resources *ResourceRequirements `json:"resources,omitempty"`
// SharedMemorySize represents the size of /dev/shm required in the runtime of
// the inference workload.
// SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
// +optional
SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
// Args represents the specified arguments of the backendRuntime,
// which will be appended to the backendRuntime.spec.Args.
Args *BackendRuntimeArg `json:"args,omitempty"`
// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time, mostly used in Playground.
// ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}

// TODO: Do not support DRA yet, we can support that once needed.
Expand All @@ -66,33 +80,3 @@ type ResourceRequirements struct {
// +optional
Requests corev1.ResourceList `json:"requests,omitempty"`
}

// ScaleTriggerRef refers to a scaleTrigger configured in the backendRuntime.
type ScaleTriggerRef struct {
// Name represents the scale trigger name defined in the backendRuntime.scaleTriggers.
Name string `json:"name"`
}

// ElasticConfig defines how inference workloads scale elastically with traffic.
type ElasticConfig struct {
// MinReplicas indicates the minimum number of inference workloads based on the traffic.
// Defaults to 1.
// MinReplicas can't be 0 for now; serverless (scale-to-zero) will be supported in the future.
// +kubebuilder:default=1
// +optional
MinReplicas *int32 `json:"minReplicas,omitempty"`
// MaxReplicas indicates the maximum number of inference workloads based on the traffic.
// Defaults to nil, which means there's no limit on the instance number.
// +optional
MaxReplicas *int32 `json:"maxReplicas,omitempty"`
// ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime
// with a tuned target value.
// ScaleTriggerRef and ScaleTrigger can't be set at the same time.
// +optional
ScaleTriggerRef *ScaleTriggerRef `json:"scaleTriggerRef,omitempty"`
// ScaleTrigger defines a set of triggers to scale the workloads.
// If not defined, the trigger configured in the backendRuntime will be used;
// otherwise, the trigger defined here overwrites the defaulted one.
// ScaleTriggerRef and ScaleTrigger can't be set at the same time.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}
15 changes: 13 additions & 2 deletions api/inference/v1alpha1/playground_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,22 @@ type PlaygroundSpec struct {
BackendRuntimeConfig *BackendRuntimeConfig `json:"backendRuntimeConfig,omitempty"`
// ElasticConfig defines the configuration for elastic usage,
// e.g. the max/min replicas.
// Note: this requires to install the HPA first or will report error.
// +optional
ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"`
}

// ElasticConfig defines how inference workloads scale elastically with traffic,
// e.g. the max/min replicas.
type ElasticConfig struct {
// MinReplicas indicates the minimum number of inference workloads based on the traffic.
// Defaults to 1.
// MinReplicas can't be 0 for now; serverless (scale-to-zero) will be supported in the future.
// +kubebuilder:default=1
// +optional
MinReplicas *int32 `json:"minReplicas,omitempty"`
// MaxReplicas indicates the maximum number of inference workloads based on the traffic.
// Defaults to nil, which means there's no limit on the instance number.
// +optional
MaxReplicas *int32 `json:"maxReplicas,omitempty"`
}

const (
// PlaygroundProgressing means the Playground is progressing now, such as waiting for the
// inference service creation, rolling update or scaling up and down.
Expand Down
Loading

0 comments on commit f4541b0

Please sign in to comment.