From 223b0884a121a8e229d409a74dd32d1daf4db549 Mon Sep 17 00:00:00 2001 From: Alan Guo Date: Tue, 10 Dec 2024 21:08:31 -0800 Subject: [PATCH] [Dashboard] Add instance variables to many default dashboard graphs (#49174) Signed-off-by: Alan Guo --- .../dashboards/default_dashboard_panels.py | 72 +++++++++---------- .../default_grafana_dashboard_base.json | 12 ++-- 2 files changed, 40 insertions(+), 44 deletions(-) diff --git a/python/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py b/python/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py index f0d6f4916229..a6c834a33dcd 100644 --- a/python/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py +++ b/python/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py @@ -38,11 +38,11 @@ def max_plus_pending(max_resource, pending_resource): unit="tasks", targets=[ Target( - expr='sum(max_over_time(ray_tasks{{IsRetry="0",State=~"FINISHED|FAILED",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",{global_filters}}}) by (State), 0)', + expr='sum(max_over_time(ray_tasks{{IsRetry="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)', legend="{{State}}", ), Target( - expr='sum(max_over_time(ray_tasks{{IsRetry!="0",State=~"FINISHED|FAILED",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",{global_filters}}}) by (State), 0)', + expr='sum(max_over_time(ray_tasks{{IsRetry!="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)', legend="{{State}} (retry)", ), ], @@ -56,11 +56,11 @@ def max_plus_pending(max_resource, pending_resource): unit="tasks", targets=[ Target( - expr='sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",{global_filters}}}) by (Name)', + expr='clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (Name), 0)', legend="{{Name}}", ), Target( - expr='sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",{global_filters}}}) by (Name)', + expr='clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (Name), 0)', legend="{{Name}} (retry)", ), ], @@ -74,11 +74,11 @@ def max_plus_pending(max_resource, pending_resource): unit="tasks", targets=[ Target( - expr='sum(ray_tasks{{IsRetry="0",State=~"RUNNING*",{global_filters}}}) by (Name)', + expr='clamp_min(sum(ray_tasks{{IsRetry="0",State=~"RUNNING*",instance=~"$Instance",{global_filters}}}) by (Name), 0)', legend="{{Name}}", ), Target( - expr='sum(ray_tasks{{IsRetry!="0",State=~"RUNNING*",{global_filters}}}) by (Name)', + expr='clamp_min(sum(ray_tasks{{IsRetry!="0",State=~"RUNNING*",instance=~"$Instance",{global_filters}}}) by (Name), 0)', legend="{{Name}} (retry)", ), ], @@ -88,7 +88,7 @@ def max_plus_pending(max_resource, pending_resource): Panel( id=33, title="Scheduler Actor State", - description="Current number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.", + description='Note: not impacted by the "Instance" variable.\n\nCurrent number of actors in a particular state.\n\nState: the actor state, as described by the rpc::ActorTableData proto in gcs.proto.',
unit="actors", targets=[ Target( @@ -99,24 +99,24 @@ def max_plus_pending(max_resource, pending_resource): ), Panel( id=42, - title="Alive Actor State", + title="Live Actor State", description="Current number of alive actors in a particular state.\n\nState: IDLE, RUNNING_TASK, RUNNING_IN_RAY_GET, RUNNING_IN_RAY_WAIT", unit="actors", targets=[ Target( - expr='sum(ray_actors{{Source="executor",{global_filters}}}) by (State)', + expr='sum(ray_actors{{Source="executor",NodeAddress=~"$Instance",{global_filters}}}) by (State)', legend="{{State}}", ) ], ), Panel( id=36, - title="Requested Live Actors by Name", - description="Current number of (live) actors with a particular name.", + title="Live Actors by Name", + description="Current number of alive actors with a particular name.", unit="actors", targets=[ Target( - expr='sum(ray_actors{{State!="DEAD",Source="gcs",{global_filters}}}) by (Name)', + expr='sum(ray_actors{{State!="DEAD",Source="executor",NodeAddress=~"$Instance",{global_filters}}}) by (Name)', legend="{{Name}}", ) ], @@ -128,11 +128,11 @@ def max_plus_pending(max_resource, pending_resource): unit="cores", targets=[ Target( - expr='sum(ray_resources{{Name="CPU",State="USED",{global_filters}}}) by (instance)', + expr='sum(ray_resources{{Name="CPU",State="USED",instance=~"$Instance",{global_filters}}}) by (instance)', legend="CPU Usage: {{instance}}", ), Target( - expr='sum(ray_resources{{Name="CPU",{global_filters}}})', + expr='sum(ray_resources{{Name="CPU",instance=~"$Instance",{global_filters}}})', legend="MAX", ), # If max + pending > max, we display this value. @@ -150,11 +150,11 @@ def max_plus_pending(max_resource, pending_resource): unit="bytes", targets=[ Target( - expr="sum(ray_object_store_memory{{{global_filters}}}) by (Location)", + expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) by (Location)', legend="{{Location}}", ), Target( - expr='sum(ray_resources{{Name="object_store_memory",{global_filters}}})', + expr='sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}})', legend="MAX", ), ], @@ -166,11 +166,11 @@ def max_plus_pending(max_resource, pending_resource): unit="GPUs", targets=[ Target( - expr='ray_resources{{Name="GPU",State="USED",{global_filters}}}', + expr='ray_resources{{Name="GPU",State="USED",instance=~"$Instance",{global_filters}}}', legend="GPU Usage: {{instance}}", ), Target( - expr='sum(ray_resources{{Name="GPU",{global_filters}}})', + expr='sum(ray_resources{{Name="GPU",instance=~"$Instance",{global_filters}}})', legend="MAX", ), # If max + pending > max, we display this value. 
@@ -184,7 +184,7 @@ def max_plus_pending(max_resource, pending_resource): Panel( id=40, title="Scheduler Placement Groups", - description="Current number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.", + description='Note: not impacted by the "Instance" variable.\n\nCurrent number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.', unit="placement groups", targets=[ Target( @@ -208,7 +208,7 @@ def max_plus_pending(max_resource, pending_resource): legend="CPU Usage: {{instance}} (head)", ), Target( - expr="sum(ray_node_cpu_count{{{global_filters}}})", + expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})', legend="MAX", ), ], @@ -228,7 +228,7 @@ def max_plus_pending(max_resource, pending_resource): legend="GPU Usage: {{instance}} (head), gpu.{{GpuIndex}}, {{GpuDeviceName}}", ), Target( - expr="sum(ray_node_gpus_available{{{global_filters}}})", + expr='sum(ray_node_gpus_available{{instance=~"$Instance",{global_filters}}})', legend="MAX", ), ], @@ -248,7 +248,7 @@ def max_plus_pending(max_resource, pending_resource): legend="Disk Used: {{instance}} (head)", ), Target( - expr="sum(ray_node_disk_free{{{global_filters}}}) + sum(ray_node_disk_usage{{{global_filters}}})", + expr='sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})', legend="MAX", ), ], @@ -292,7 +292,7 @@ def max_plus_pending(max_resource, pending_resource): legend="Memory Used: {{instance}} (head)", ), Target( - expr="sum(ray_node_mem_total{{{global_filters}}})", + expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})', legend="MAX", ), ], @@ -334,15 +334,15 @@ def max_plus_pending(max_resource, pending_resource): unit="bytes", targets=[ Target( - expr="(sum(ray_component_rss_mb{{{global_filters}}} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{{{global_filters}}}) by (Component))", + expr='(sum(ray_component_rss_mb{{instance=~"$Instance",{global_filters}}} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{{instance=~"$Instance",{global_filters}}}) by (Component))', legend="{{Component}}", ), Target( - expr="sum(ray_node_mem_shared_bytes{{{global_filters}}})", + expr='sum(ray_node_mem_shared_bytes{{instance=~"$Instance",{global_filters}}})', legend="shared_memory", ), Target( - expr="sum(ray_node_mem_total{{{global_filters}}})", + expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})', legend="MAX", ), ], @@ -355,11 +355,11 @@ def max_plus_pending(max_resource, pending_resource): targets=[ Target( # ray_component_cpu_percentage returns a percentage that can be > 100. It means that it uses more than 1 CPU.
- expr="sum(ray_component_cpu_percentage{{{global_filters}}}) by (Component) / 100", + expr='sum(ray_component_cpu_percentage{{instance=~"$Instance",{global_filters}}}) by (Component) / 100', legend="{{Component}}", ), Target( - expr="sum(ray_node_cpu_count{{{global_filters}}})", + expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})', legend="MAX", ), ], @@ -375,7 +375,7 @@ def max_plus_pending(max_resource, pending_resource): legend="Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", ), Target( - expr="(sum(ray_node_gram_available{{{global_filters}}}) + sum(ray_node_gram_used{{{global_filters}}})) * 1024 * 1024", + expr='(sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 1024 * 1024', legend="MAX", ), ], @@ -399,7 +399,7 @@ def max_plus_pending(max_resource, pending_resource): Panel( id=24, title="Node Count", - description="A total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there's no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.", + description='Note: not impacted by "Instance" variable.\n\nA total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there\'s no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. 
The node is unavailable now because it is being provisioned and initialized.', unit="nodes", targets=[ Target( @@ -424,32 +424,32 @@ def max_plus_pending(max_resource, pending_resource): targets=[ # CPU Target( - expr="avg(ray_node_cpu_utilization{{{global_filters}}})", + expr='avg(ray_node_cpu_utilization{{instance=~"$Instance",{global_filters}}})', legend="CPU (physical)", ), # GPU Target( - expr="sum(ray_node_gpus_utilization{{{global_filters}}}) / on() (sum(autoscaler_cluster_resources{{resource='GPU',{global_filters}}}) or vector(0))", + expr='sum(ray_node_gpus_utilization{{instance=~"$Instance",{global_filters}}}) / on() (sum(autoscaler_cluster_resources{{resource="GPU",instance=~"$Instance",{global_filters}}}) or vector(0))', legend="GPU (physical)", ), # Memory Target( - expr="sum(ray_node_mem_used{{{global_filters}}}) / on() (sum(ray_node_mem_total{{{global_filters}}})) * 100", + expr='sum(ray_node_mem_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})) * 100', legend="Memory (RAM)", ), # GRAM Target( - expr="sum(ray_node_gram_used{{{global_filters}}}) / on() (sum(ray_node_gram_available{{{global_filters}}}) + sum(ray_node_gram_used{{{global_filters}}})) * 100", + expr='sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 100', legend="GRAM", ), # Object Store Target( - expr='sum(ray_object_store_memory{{{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",{global_filters}}}) * 100', + expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}}) * 100', legend="Object Store Memory", ), # Disk Target( - expr="sum(ray_node_disk_usage{{{global_filters}}}) / on() (sum(ray_node_disk_free{{{global_filters}}}) + sum(ray_node_disk_usage{{{global_filters}}})) * 100", + expr='sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})) * 100', legend="Disk", ), ], diff --git a/python/ray/dashboard/modules/metrics/dashboards/default_grafana_dashboard_base.json b/python/ray/dashboard/modules/metrics/dashboards/default_grafana_dashboard_base.json index 4d61d9359fd5..b7d93ceb902d 100644 --- a/python/ray/dashboard/modules/metrics/dashboards/default_grafana_dashboard_base.json +++ b/python/ray/dashboard/modules/metrics/dashboards/default_grafana_dashboard_base.json @@ -73,15 +73,11 @@ "allValue": ".+", "current": { "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] + "text": ["All"], + "value": ["$__all"] }, "datasource": "${datasource}", - "definition": "label_values(ray_node_network_receive_speed{{SessionName=\"$SessionName\",{global_filters}}}, instance)", + "definition": "label_values(ray_node_network_receive_speed{{SessionName=~\"$SessionName\",{global_filters}}}, instance)", "description": null, "error": null, "hide": 0, @@ -91,7 +87,7 @@ "name": "Instance", "options": [], "query": { - "query": "label_values(ray_node_network_receive_speed{{SessionName=\"$SessionName\",{global_filters}}}, instance)", + "query": "label_values(ray_node_network_receive_speed{{SessionName=~\"$SessionName\",{global_filters}}}, instance)", "refId": "Prometheus-Instance-Variable-Query" }, 
"refresh": 2,