Skip to content

Commit

Permalink
Merge pull request #276 from kerthcet/document/api-reference
Browse files Browse the repository at this point in the history
Add recommendedConfigs to backendRuntime
  • Loading branch information
InftyAI-Agent authored Feb 18, 2025
2 parents d5ec014 + a719916 commit f4541b0
Show file tree
Hide file tree
Showing 50 changed files with 1,324 additions and 1,410 deletions.
61 changes: 27 additions & 34 deletions api/inference/v1alpha1/backendruntime_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,10 @@ package v1alpha1
import (
autoscalingv2 "k8s.io/api/autoscaling/v2"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// BackendRuntimeArg defines a named preset of command-line arguments for a backendRuntime,
// provided for ease of use. Three preset names are provided: default, speculative-decoding
// and model-parallelism; do not change these names.
type BackendRuntimeArg struct {
// Name represents the identifier of the backendRuntime argument preset.
// Defaults to "default" when unset.
// +kubebuilder:default=default
// +optional
Name *string `json:"name,omitempty"`
// Flags represents all the preset command-line flags.
// A flag wrapped with {{ .CONFIG }} is a template placeholder awaiting render.
Flags []string `json:"flags,omitempty"`
}

// HPATrigger represents the configuration of the HorizontalPodAutoscaler.
// Inspired by kubernetes.io/pkg/apis/autoscaling/types.go#HorizontalPodAutoscalerSpec.
// Note: HPA component should be installed in prior.
Expand All @@ -55,17 +43,6 @@ type HPATrigger struct {
Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"`
}

// NamedScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time; the name is used to identify
// the trigger in the backendRuntime.
type NamedScaleTrigger struct {
// Name represents the identifier of the scale trigger, e.g. some triggers are defined
// for latency-sensitive workloads, others for throughput-sensitive workloads.
Name string `json:"name,omitempty"`
// HPA represents the trigger configuration of the HorizontalPodAutoscaler.
HPA *HPATrigger `json:"hpa,omitempty"`
}

// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time, mostly used in Playground.
type ScaleTrigger struct {
Expand All @@ -83,6 +60,30 @@ type MultiHostCommands struct {
Worker []string `json:"worker,omitempty"`
}

// RecommendedConfig represents the recommended configurations for the backendRuntime;
// users can choose one of them to apply.
type RecommendedConfig struct {
// Name represents the identifier of the config.
Name string `json:"name"`
// Args represents all the arguments for the command.
// An argument wrapped with {{ .CONFIG }} is a template placeholder awaiting render.
// +optional
Args []string `json:"args,omitempty"`
// Resources represents the resource requirements for the backend, like cpu/mem;
// accelerators like GPU should not be defined here but in the model flavors,
// or the values here will be overwritten.
// +optional
Resources *ResourceRequirements `json:"resources,omitempty"`
// SharedMemorySize represents the size of /dev/shm required in the runtime of
// the inference workload.
// +optional
SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}

// BackendRuntimeSpec defines the desired state of BackendRuntime
type BackendRuntimeSpec struct {
// Commands represents the default commands for the backendRuntime.
Expand All @@ -98,16 +99,9 @@ type BackendRuntimeSpec struct {
// Version represents the default version of the backendRuntime.
// It will be appended to the image as a tag.
Version string `json:"version"`
// Args represents the preset arguments of the backendRuntime.
// They can be appended or overwritten by the Playground backendRuntimeConfig.
Args []BackendRuntimeArg `json:"args,omitempty"`
// Envs represents the environments set to the container.
// +optional
Envs []corev1.EnvVar `json:"envs,omitempty"`
// Resources represents the resource requirements for backendRuntime, like cpu/mem,
// accelerators like GPU should not be defined here, but at the model flavors,
// or the values here will be overwritten.
Resources ResourceRequirements `json:"resources"`
// Periodic probe of backend liveness.
// Backend will be restarted if the probe fails.
// Cannot be updated.
Expand All @@ -124,10 +118,9 @@ type BackendRuntimeSpec struct {
// when it might take a long time to load data or warm a cache, than during steady-state operation.
// +optional
StartupProbe *corev1.Probe `json:"startupProbe,omitempty"`
// ScaleTriggers represents a set of triggers preset to be used by Playground.
// If Playground not specify the scale trigger, the 0-index trigger will be used.
// RecommendedConfigs represents the recommended configurations for the backendRuntime.
// +optional
ScaleTriggers []NamedScaleTrigger `json:"scaleTriggers,omitempty"`
RecommendedConfigs []RecommendedConfig `json:"recommendedConfigs,omitempty"`
}

// BackendRuntimeStatus defines the observed state of BackendRuntime
Expand Down
56 changes: 20 additions & 36 deletions api/inference/v1alpha1/config_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,29 +28,43 @@ const (
)

// NOTE(review): this span is a rendered diff hunk — it still contains both the removed
// fields (Name, the trailing Args) and the added ones (BackendName, ConfigName, the
// leading Args); confirm against the applied file before relying on the exact field set.
type BackendRuntimeConfig struct {
// Name represents the inference backend under the hood, e.g. vLLM.
// BackendName represents the inference backend under the hood, e.g. vLLM.
// +kubebuilder:default=vllm
// +optional
Name *BackendName `json:"name,omitempty"`
BackendName *BackendName `json:"backendName,omitempty"`
// Version represents the backend version if you want a different one
// from the default version.
// +optional
Version *string `json:"version,omitempty"`
// Envs represents the environments set to the container.
// +optional
Envs []corev1.EnvVar `json:"envs,omitempty"`

// ConfigName represents the recommended configuration name for the backend.
// It will be inferred from the models in the runtime if not specified, e.g. default,
// speculative-decoding or model-parallelism.
ConfigName *string `json:"configName,omitempty"`
// Args represents all the arguments for the command.
// An argument wrapped with {{ .CONFIG }} is a template placeholder awaiting render.
// +optional
// Args defined here will "append" to the args in the recommendedConfig.
// +optional
Args []string `json:"args,omitempty"`
// Resources represents the resource requirements for the backend, like cpu/mem;
// accelerators like GPU should not be defined here but in the model flavors,
// or the values here will be overwritten.
// Resources defined here will "overwrite" the resources in the recommendedConfig.
// +optional
Resources *ResourceRequirements `json:"resources,omitempty"`
// SharedMemorySize represents the size of /dev/shm required in the runtime of
// the inference workload.
// SharedMemorySize defined here will "overwrite" the sharedMemorySize in the recommendedConfig.
// +optional
SharedMemorySize *resource.Quantity `json:"sharedMemorySize,omitempty"`
// Args represents the specified arguments of the backendRuntime,
// which will be appended to the backendRuntime.spec.Args.
Args *BackendRuntimeArg `json:"args,omitempty"`
// ScaleTrigger defines the rules to scale the workloads.
// Only one trigger could work at a time, mostly used in Playground.
// ScaleTrigger defined here will "overwrite" the scaleTrigger in the recommendedConfig.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}

// TODO: Do not support DRA yet, we can support that once needed.
Expand All @@ -66,33 +80,3 @@ type ResourceRequirements struct {
// +optional
Requests corev1.ResourceList `json:"requests,omitempty"`
}

// ScaleTriggerRef refers to a scaleTrigger configured in the backendRuntime.
type ScaleTriggerRef struct {
// Name represents the scale trigger name defined in the backendRuntime.scaleTriggers.
Name string `json:"name"`
}

// ElasticConfig defines how inference workloads scale elastically with traffic.
type ElasticConfig struct {
// MinReplicas indicates the minimum number of inference workloads based on the traffic.
// Defaults to 1.
// MinReplicas can't be 0 for now; serverless (scale-to-zero) will be supported in the future.
// +kubebuilder:default=1
// +optional
MinReplicas *int32 `json:"minReplicas,omitempty"`
// MaxReplicas indicates the maximum number of inference workloads based on the traffic.
// Defaults to nil, which means there's no limit on the instance number.
// +optional
MaxReplicas *int32 `json:"maxReplicas,omitempty"`
// ScaleTriggerRef refers to the configured scaleTrigger in the backendRuntime
// with a tuned target value.
// ScaleTriggerRef and ScaleTrigger can't be set at the same time.
// +optional
ScaleTriggerRef *ScaleTriggerRef `json:"scaleTriggerRef,omitempty"`
// ScaleTrigger defines a set of triggers to scale the workloads.
// If not defined, the trigger configured in the backendRuntime will be used;
// otherwise, the trigger defined here overwrites the defaulted one.
// ScaleTriggerRef and ScaleTrigger can't be set at the same time.
// +optional
ScaleTrigger *ScaleTrigger `json:"scaleTrigger,omitempty"`
}
15 changes: 13 additions & 2 deletions api/inference/v1alpha1/playground_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,22 @@ type PlaygroundSpec struct {
BackendRuntimeConfig *BackendRuntimeConfig `json:"backendRuntimeConfig,omitempty"`
// ElasticConfig defines the configuration for elastic usage,
// e.g. the max/min replicas.
// Note: this requires to install the HPA first or will report error.
// +optional
ElasticConfig *ElasticConfig `json:"elasticConfig,omitempty"`
}

// ElasticConfig defines how inference workloads scale elastically with traffic,
// e.g. the max/min replicas.
type ElasticConfig struct {
// MinReplicas indicates the minimum number of inference workloads based on the traffic.
// Defaults to 1.
// MinReplicas can't be 0 for now; serverless (scale-to-zero) will be supported in the future.
// +kubebuilder:default=1
// +optional
MinReplicas *int32 `json:"minReplicas,omitempty"`
// MaxReplicas indicates the maximum number of inference workloads based on the traffic.
// Defaults to nil, which means there's no limit on the instance number.
// +optional
MaxReplicas *int32 `json:"maxReplicas,omitempty"`
}

const (
// PlaygroundProgressing means the Playground is progressing now, such as waiting for the
// inference service creation, rolling update or scaling up and down.
Expand Down
Loading

0 comments on commit f4541b0

Please sign in to comment.