diff --git a/controllers/constants/constants.go b/controllers/constants/constants.go index dee82832..728ebd76 100644 --- a/controllers/constants/constants.go +++ b/controllers/constants/constants.go @@ -70,6 +70,7 @@ const ( DefaultStorageConfig = "storage-config" IntervalValue = "1m" RequestRateInterval = "5m" + GPUKVCacheSamplingInterval = "24h" OvmsImageName = "openvino_model_server" TgisImageName = "text-generation-inference" VllmImageName = "vllm" diff --git a/controllers/constants/runtime-metrics.go b/controllers/constants/runtime-metrics.go index 0ab23eaf..7dffd17a 100644 --- a/controllers/constants/runtime-metrics.go +++ b/controllers/constants/runtime-metrics.go @@ -247,7 +247,7 @@ const ( "queries": [ { "title": "Average e2e latency", - "query": "sum by (model_name) (rate(e2e_request_latency_seconds_sum{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]) * 1000) / sum by (model_name) (rate(e2e_request_latency_seconds_count{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]) * 1000)" + "query": "(rate(e2e_request_latency_seconds_sum{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]) * 1000) / (rate(e2e_request_latency_seconds_count{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]) * 1000)" } ] }, @@ -277,7 +277,7 @@ const ( "queries": [ { "title": "GPU cache usage over time", - "query": "round(sum(increase(gpu_cache_usage_perc{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}])))" + "query": "sum_over_time(gpu_cache_usage_perc{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${KV_CACHE_SAMPLING_RATE}])" } ] }, @@ -305,11 +305,11 @@ const ( "queries": [ { "title": "Total prompts token", - "query": "round(sum(increase(prompt_tokens_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}])))" + "query": "round(rate(prompt_tokens_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]))" }, { "title": "Total generation token", - "query": "round(sum(increase(generation_tokens_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}])))" + "query": "round(rate(generation_tokens_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]))" } ] }, @@ -319,7 +319,7 @@ const ( "queries": [ { "title": "Time to first token", - "query": "rate(time_to_first_token_seconds_bucket{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}])" + "query": "rate(time_to_first_token_seconds_sum{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}])" } ] }, @@ -329,7 +329,7 @@ const ( "queries": [ { "title": "Time per output token", - "query": "rate(time_per_output_token_seconds_bucket{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}])" + "query": "rate(time_per_output_token_seconds_sum{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}])" } ] }, @@ -339,11 +339,11 @@ const ( "queries": [ { "title": "Number of successful incoming requests", - "query": "request_success_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}" + "query": "round(sum(increase(request_success_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${REQUEST_RATE_INTERVAL}])))" }, { "title": "Number of failed incoming requests", - "query": "request_failure_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}" + "query": "round(sum(increase(request_failure_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${REQUEST_RATE_INTERVAL}])))" } ] } diff --git a/controllers/utils/utils.go b/controllers/utils/utils.go index 7fde9682..4aaaabe5 100644 --- a/controllers/utils/utils.go +++ b/controllers/utils/utils.go @@ -427,6 +427,7 @@ func SubstituteVariablesInQueries(data string, namespace string, name string) st "${NAMESPACE}", namespace, "${MODEL_NAME}", name, "${RATE_INTERVAL}", constants.IntervalValue, - "${REQUEST_RATE_INTERVAL}", constants.RequestRateInterval) + "${REQUEST_RATE_INTERVAL}", constants.RequestRateInterval, + "${KV_CACHE_SAMPLING_RATE}", constants.GPUKVCacheSamplingInterval) return replacer.Replace(data) }