From aee3e05816baa3384d6677c8694b446fc66aba1a Mon Sep 17 00:00:00 2001
From: Tomer Figenblat <tomer.figenblat@gmail.com>
Date: Mon, 2 Dec 2024 01:21:19 -0500
Subject: [PATCH] feat: added performance metric graphs config for nvidia nim
 (#320)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: added performance metric graphs config for nvidia nim

Signed-off-by: Tomer Figenblat <tfigenbl@redhat.com>

* chore: modified the runtime id annotation

Co-authored-by: Edgar Hernández <ehernand@redhat.com>
Signed-off-by: Tomer Figenblat <tfigenbl@redhat.com>

---------

Signed-off-by: Tomer Figenblat <tfigenbl@redhat.com>
Co-authored-by: Edgar Hernández <ehernand@redhat.com>
---
 controllers/constants/runtime-metrics.go      | 50 +++++++++++++++++++
 .../kserve_metrics_dashboard_reconciler.go    | 46 ++++++++++-------
 controllers/utils/nim.go                      |  6 +++
 3 files changed, 84 insertions(+), 18 deletions(-)

diff --git a/controllers/constants/runtime-metrics.go b/controllers/constants/runtime-metrics.go
index 7ca95b37..b09b2572 100644
--- a/controllers/constants/runtime-metrics.go
+++ b/controllers/constants/runtime-metrics.go
@@ -223,4 +223,54 @@ const (
 			}
 		]
     }`
+
+	// NIMMetricsData is the performance graphs config for NVIDIA NIM; mean latency is seconds_sum*1000/count, i.e. milliseconds.
+	NIMMetricsData = `{
+        "config": [
+			{
+				"title": "Requests per 5 minutes",
+				"type": "REQUEST_COUNT",
+				"queries": [
+					{
+						"title": "Number of successful incoming requests",
+						"query": "round(sum(increase(request_success_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${REQUEST_RATE_INTERVAL}])))"
+					},
+					{
+						"title": "Number of failed incoming requests",
+						"query": "round(sum(increase(request_failure_total{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${REQUEST_RATE_INTERVAL}])))"
+					}
+				]
+			},
+{
+				"title": "Average response time (ms)",
+				"type": "MEAN_LATENCY",
+				"queries": [
+					{
+						"title": "Average e2e latency",
+						"query": "sum by (model_name) (rate(e2e_request_latency_seconds_sum{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]) * 1000) / sum by (model_name) (rate(e2e_request_latency_seconds_count{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'}[${RATE_INTERVAL}]))"
+					}
+				]
+			},
+			{
+				"title": "CPU utilization %",
+				"type": "CPU_USAGE",
+				"queries": [
+					{
+						"title": "CPU usage",
+						"query":  "sum(pod:container_cpu_usage:sum{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'})/sum(kube_pod_resource_limit{resource='cpu', pod=~'${MODEL_NAME}-predictor-.*', namespace='${NAMESPACE}'})"
+					}
+				]
+			},
+			{
+				"title": "Memory utilization %",
+				"type": "MEMORY_USAGE",
+				"queries": [
+					{
+						"title": "Memory usage",
+						"query":  "sum(container_memory_working_set_bytes{namespace='${NAMESPACE}', pod=~'${MODEL_NAME}-predictor-.*'})/sum(kube_pod_resource_limit{resource='memory', pod=~'${MODEL_NAME}-predictor-.*', namespace='${NAMESPACE}'})"
+					}
+				]
+			}
+		]
+    }`
 )
diff --git a/controllers/reconcilers/kserve_metrics_dashboard_reconciler.go b/controllers/reconcilers/kserve_metrics_dashboard_reconciler.go
index e07c243c..bf3d3ead 100644
--- a/controllers/reconcilers/kserve_metrics_dashboard_reconciler.go
+++ b/controllers/reconcilers/kserve_metrics_dashboard_reconciler.go
@@ -92,7 +92,6 @@ func (r *KserveMetricsDashboardReconciler) Reconcile(ctx context.Context, log lo
 func (r *KserveMetricsDashboardReconciler) createDesiredResource(ctx context.Context, log logr.Logger, isvc *kservev1beta1.InferenceService) (*corev1.ConfigMap, error) {
 
 	var err error
-	var servingRuntime string
 	runtime := &kservev1alpha1.ServingRuntime{}
 	supported := false
 
@@ -128,24 +127,8 @@ func (r *KserveMetricsDashboardReconciler) createDesiredResource(ctx context.Con
 		supported = false
 	}
 
-	servingRuntimeImage := runtime.Spec.Containers[0].Image
-	re := regexp.MustCompile(`/([^/@]+)[@:]`)
-	findImageName := re.FindStringSubmatch(servingRuntimeImage)
-	// sanity check for regex match, will fall back to a known string that will lead to a configmap for unsupported metrics
-	if len(findImageName) < 2 {
-		servingRuntime = constants.ServingRuntimeFallBackImageName
-	} else {
-		servingRuntime = findImageName[1]
-	}
-
-	runtimeMetricsData := map[string]string{
-		constants.OvmsImageName:   constants.OvmsMetricsData,
-		constants.TgisImageName:   constants.TgisMetricsData,
-		constants.VllmImageName:   constants.VllmMetricsData,
-		constants.CaikitImageName: constants.CaikitMetricsData,
-	}
 	// supported is true only when a match on this map is found, is false otherwise
-	data, supported := runtimeMetricsData[servingRuntime]
+	data, supported := getMetricsData(runtime)
 	configMap, err := r.createConfigMap(isvc, supported, log)
 	if err != nil {
 		return nil, err
@@ -220,3 +203,30 @@ func (r *KserveMetricsDashboardReconciler) processDelta(ctx context.Context, log
 	}
 	return nil
 }
+
+// getMetricsData returns the metrics dashboard JSON for the given serving runtime and whether the runtime is
+// supported: NIM runtimes are matched by annotation, all others by the image name of their first container.
+func getMetricsData(runtime *kservev1alpha1.ServingRuntime) (string, bool) {
+	if runtime.Annotations[utils.IsNimRuntimeAnnotation] == "true" {
+		return constants.NIMMetricsData, true
+	}
+	// without a container there is no image to inspect; report unsupported rather than panic on Containers[0]
+	if len(runtime.Spec.Containers) == 0 {
+		return "", false
+	}
+	// sanity check for regex match, will fall back to a known string that will lead to a configmap for unsupported metrics
+	servingRuntime := constants.ServingRuntimeFallBackImageName
+	re := regexp.MustCompile(`/([^/@]+)[@:]`)
+	if findImageName := re.FindStringSubmatch(runtime.Spec.Containers[0].Image); len(findImageName) >= 2 {
+		servingRuntime = findImageName[1]
+	}
+
+	runtimeMetricsData := map[string]string{
+		constants.OvmsImageName:   constants.OvmsMetricsData,
+		constants.TgisImageName:   constants.TgisMetricsData,
+		constants.VllmImageName:   constants.VllmMetricsData,
+		constants.CaikitImageName: constants.CaikitMetricsData,
+	}
+	data, supported := runtimeMetricsData[servingRuntime]
+	return data, supported
+}
diff --git a/controllers/utils/nim.go b/controllers/utils/nim.go
index 07e776af..4f8e2c00 100644
--- a/controllers/utils/nim.go
+++ b/controllers/utils/nim.go
@@ -91,6 +91,7 @@ const (
 	nimGetNgcCatalog         = "https://api.ngc.nvidia.com/v2/search/catalog/resources/CONTAINER"
 	nimGetNgcToken           = "https://authn.nvidia.com/token?service=ngc&"
 	nimGetNgcModelDataFmt    = "https://api.ngc.nvidia.com/v2/org/%s/team/%s/repos/%s?resolve-labels=true"
+	IsNimRuntimeAnnotation   = "runtimes.opendatahub.io/nvidia-nim"
 )
 
 var NimHttpClient HttpClient
@@ -296,6 +297,7 @@ func GetNimServingRuntimeTemplate(scheme *runtime.Scheme) (*v1alpha1.ServingRunt
 			Annotations: map[string]string{
 				"opendatahub.io/recommended-accelerators": "[\"nvidia.com/gpu\"]",
 				"openshift.io/display-name":               "NVIDIA NIM",
+				IsNimRuntimeAnnotation:                    "true",
 			},
 			Labels: map[string]string{
 				"opendatahub.io/dashboard": "true",
@@ -304,6 +306,10 @@ func GetNimServingRuntimeTemplate(scheme *runtime.Scheme) (*v1alpha1.ServingRunt
 		},
 		Spec: v1alpha1.ServingRuntimeSpec{
 			ServingRuntimePodSpec: v1alpha1.ServingRuntimePodSpec{
+				Annotations: map[string]string{
+					"prometheus.io/path": "/metrics",
+					"prometheus.io/port": "8000",
+				},
 				Containers: []corev1.Container{
 					{Env: []corev1.EnvVar{
 						{