#46 Defined latency measurement config

EinStack · Jan 14, 2024 · dee8bbb · dee8bbb
1 parent 36b1fad
commit dee8bbb
Show file tree

Hide file tree

Showing 6 changed files with 88 additions and 10 deletions.
diff --git a/docs/docs.go b/docs/docs.go
@@ -129,7 +129,7 @@ const docTemplate = `{
             "type": "object",
             "properties": {
                 "timeout": {
-                    "type": "integer"
+                    "type": "string"
                 }
             }
         },
@@ -160,6 +160,23 @@ const docTemplate = `{
                 }
             }
         },
+        "latency.Config": {
+            "type": "object",
+            "properties": {
+                "decay": {
+                    "description": "Weight of new latency measurements",
+                    "type": "number"
+                },
+                "update_interval": {
+                    "description": "How often gateway should probe models with not the lowest response latency",
+                    "type": "string"
+                },
+                "warmup_samples": {
+                    "description": "The number of latency probes required to init moving average",
+                    "type": "integer"
+                }
+            }
+        },
         "openai.Config": {
             "type": "object",
             "required": [
@@ -253,6 +270,9 @@ const docTemplate = `{
                     "description": "Model instance ID (unique in scope of the router)",
                     "type": "string"
                 },
+                "latency": {
+                    "$ref": "#/definitions/latency.Config"
+                },
                 "openai": {
                     "$ref": "#/definitions/openai.Config"
                 }
@@ -318,13 +338,13 @@ const docTemplate = `{
         "routing.Strategy": {
             "type": "string",
             "enum": [
-                "least_latency",
                 "priority",
+                "least_latency",
                 "round-robin"
             ],
             "x-enum-varnames": [
-                "LeastLatency",
                 "Priority",
+                "LeastLatency",
                 "RoundRobin"
             ]
         },

diff --git a/docs/swagger.json b/docs/swagger.json
@@ -126,7 +126,7 @@
             "type": "object",
             "properties": {
                 "timeout": {
-                    "type": "integer"
+                    "type": "string"
                 }
             }
         },
@@ -157,6 +157,23 @@
                 }
             }
         },
+        "latency.Config": {
+            "type": "object",
+            "properties": {
+                "decay": {
+                    "description": "Weight of new latency measurements",
+                    "type": "number"
+                },
+                "update_interval": {
+                    "description": "How often gateway should probe models with not the lowest response latency",
+                    "type": "string"
+                },
+                "warmup_samples": {
+                    "description": "The number of latency probes required to init moving average",
+                    "type": "integer"
+                }
+            }
+        },
         "openai.Config": {
             "type": "object",
             "required": [
@@ -250,6 +267,9 @@
                     "description": "Model instance ID (unique in scope of the router)",
                     "type": "string"
                 },
+                "latency": {
+                    "$ref": "#/definitions/latency.Config"
+                },
                 "openai": {
                     "$ref": "#/definitions/openai.Config"
                 }
@@ -315,13 +335,13 @@
         "routing.Strategy": {
             "type": "string",
             "enum": [
-                "least_latency",
                 "priority",
+                "least_latency",
                 "round-robin"
             ],
             "x-enum-varnames": [
-                "LeastLatency",
                 "Priority",
+                "LeastLatency",
                 "RoundRobin"
             ]
         },

diff --git a/docs/swagger.yaml b/docs/swagger.yaml
@@ -3,7 +3,7 @@ definitions:
   clients.ClientConfig:
     properties:
       timeout:
-        type: integer
+        type: string
     type: object
   http.ErrorSchema:
     properties:
@@ -22,6 +22,19 @@ definitions:
           $ref: '#/definitions/routers.LangRouterConfig'
         type: array
     type: object
+  latency.Config:
+    properties:
+      decay:
+        description: Weight of new latency measurements
+        type: number
+      update_interval:
+        description: How often gateway should probe models with not the lowest response
+          latency
+        type: string
+      warmup_samples:
+        description: The number of latency probes required to init moving average
+        type: integer
+    type: object
   openai.Config:
     properties:
       baseUrl:
@@ -83,6 +96,8 @@ definitions:
       id:
         description: Model instance ID (unique in scope of the router)
         type: string
+      latency:
+        $ref: '#/definitions/latency.Config'
       openai:
         $ref: '#/definitions/openai.Config'
     required:
@@ -126,13 +141,13 @@ definitions:
     type: object
   routing.Strategy:
     enum:
-    - least_latency
     - priority
+    - least_latency
     - round-robin
     type: string
     x-enum-varnames:
-    - LeastLatency
     - Priority
+    - LeastLatency
     - RoundRobin
   schemas.ChatMessage:
     properties:

diff --git a/pkg/providers/clients/config.go b/pkg/providers/clients/config.go
@@ -3,7 +3,7 @@ package clients
 import "time"
 
 type ClientConfig struct {
-	Timeout *time.Duration `yaml:"timeout,omitempty" json:"timeout" swaggertype:"primitive,integer"`
+	Timeout *time.Duration `yaml:"timeout,omitempty" json:"timeout" swaggertype:"primitive,string"`
 }
 
 func DefaultClientConfig() *ClientConfig {

diff --git a/pkg/providers/config.go b/pkg/providers/config.go
@@ -3,6 +3,7 @@ package providers
 import (
 	"errors"
 	"fmt"
+	"glide/pkg/routers/latency"
 
 	"glide/pkg/providers/clients"
 
@@ -18,6 +19,7 @@ type LangModelConfig struct {
 	ID          string                `yaml:"id" json:"id" validate:"required"` // Model instance ID (unique in scope of the router)
 	Enabled     bool                  `yaml:"enabled" json:"enabled"`           // Is the model enabled?
 	ErrorBudget health.ErrorBudget    `yaml:"error_budget" json:"error_budget" swaggertype:"primitive,string"`
+	Latency     *latency.Config       `yaml:"latency" json:"latency"`
 	Client      *clients.ClientConfig `yaml:"client" json:"client"`
 	OpenAI      *openai.Config        `yaml:"openai" json:"openai"`
 	// Add other providers like
@@ -30,6 +32,7 @@ func DefaultLangModelConfig() *LangModelConfig {
 		Enabled:     true,
 		Client:      clients.DefaultClientConfig(),
 		ErrorBudget: health.DefaultErrorBudget(),
+		Latency:     latency.DefaultConfig(),
 	}
 }
 

diff --git a/pkg/routers/latency/config.go b/pkg/routers/latency/config.go
@@ -0,0 +1,20 @@
+package latency
+
+import "time"
+
+// Config defines the settings for moving average latency calculations: decay weight, warmup sample count and probe interval.
+type Config struct {
+	Decay          float32        `yaml:"decay" json:"decay"`                                                              // Weight of new latency measurements
+	WarmupSamples  int            `yaml:"warmup_samples" json:"warmup_samples"`                                            // The number of latency probes required to init moving average
+	UpdateInterval *time.Duration `yaml:"update_interval,omitempty" json:"update_interval" swaggertype:"primitive,string"` // How often gateway should probe models with not the lowest response latency
+}
+
+func DefaultConfig() *Config {
+	defaultUpdateInterval := 30 * time.Second // named local so its address can be stored in the *time.Duration field below
+
+	return &Config{
+		Decay:          0.06,
+		WarmupSamples:  3,
+		UpdateInterval: &defaultUpdateInterval,
+	}
+}