elastic · tonyghiani · Sep 28, 2023 · Sep 26, 2023 · Sep 26, 2023 · Sep 26, 2023
diff --git a/x-pack/plugins/monitoring/common/types/alerts.ts b/x-pack/plugins/monitoring/common/types/alerts.ts
@@ -171,7 +171,6 @@ export interface AlertNodeStats {
 export interface AlertCpuUsageNodeStats extends AlertNodeStats {
   cpuUsage?: number;
   limitsChanged?: boolean;
-  missingLimits?: boolean;
   unexpectedLimits?: boolean;
 }
 

diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts
@@ -258,12 +258,7 @@ describe('CpuUsageRule', () => {
 
     it('should fire actions when resource limits are missing', async () => {
       (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => {
-        return [
-          {
-            ...stat,
-            missingLimits: true,
-          },
-        ];
+        return [stat];
       });
 
       const rule = new CpuUsageRule();
@@ -287,14 +282,39 @@ describe('CpuUsageRule', () => {
               nodeId,
               nodeName,
               threshold,
-              missingLimits: true,
             },
             nodeId,
             nodeName,
             ui: {
               isFiring: true,
               message: {
-                text: `Kibana is configured for containerized workloads but node #start_linkmyNodeName#end_link does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%. Last checked at #absolute`,
+                text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`,
+                nextSteps: [
+                  {
+                    text: '#start_linkCheck hot threads#end_link',
+                    tokens: [
+                      {
+                        startToken: '#start_link',
+                        endToken: '#end_link',
+                        type: 'docLink',
+                        partialUrl:
+                          '{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/cluster-nodes-hot-threads.html',
+                      },
+                    ],
+                  },
+                  {
+                    text: '#start_linkCheck long running tasks#end_link',
+                    tokens: [
+                      {
+                        startToken: '#start_link',
+                        endToken: '#end_link',
+                        type: 'docLink',
+                        partialUrl:
+                          '{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html',
+                      },
+                    ],
+                  },
+                ],
                 tokens: [
                   {
                     startToken: '#start_link',
@@ -319,8 +339,8 @@ describe('CpuUsageRule', () => {
         ],
       });
       expect(scheduleActions).toHaveBeenCalledWith('default', {
-        internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
-        internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`,
+        internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
+        internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`,
         action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
         actionPlain: 'Verify CPU usage of node.',
         clusterName,

diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts
@@ -100,12 +100,7 @@ export class CpuUsageRule extends BaseRule {
     stat: AlertCpuUsageNodeStats,
     threshold: number
   ): { shouldFire: boolean; severity: AlertSeverity } {
-    if (
-      stat.missingLimits ||
-      stat.limitsChanged ||
-      stat.unexpectedLimits ||
-      stat.cpuUsage === undefined
-    ) {
+    if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
       let severity = AlertSeverity.Warning;
       if (stat.cpuUsage && stat.cpuUsage > threshold) {
         severity = AlertSeverity.Danger;
@@ -149,19 +144,6 @@ export class CpuUsageRule extends BaseRule {
       } as AlertMessageTimeToken,
     ];
 
-    if (stat.missingLimits) {
-      return {
-        text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.missingLimits', {
-          defaultMessage: `Kibana is configured for containerized workloads but node #start_link{nodeName}#end_link does not have resource limits configured. Fallback metric reports usage of {cpuUsage}%. Last checked at #absolute`,
-          values: {
-            nodeName: stat.nodeName,
-            cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT),
-          },
-        }),
-        tokens,
-      };
-    }
-
     if (stat.unexpectedLimits) {
       return {
         text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', {
@@ -273,12 +255,7 @@ export class CpuUsageRule extends BaseRule {
   private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) {
     const stat = state.meta as AlertCpuUsageNodeStats;
 
-    if (
-      stat.missingLimits ||
-      stat.limitsChanged ||
-      stat.unexpectedLimits ||
-      stat.cpuUsage === undefined
-    ) {
+    if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
       return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', {
         defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`,
         values: {

diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts
@@ -126,10 +126,10 @@ describe('fetchCpuUsageNodeStats', () => {
                         value: 45,
                       },
                       quota_micros_max: {
-                        value: -1,
+                        value: 2000,
                       },
                       quota_micros_min: {
-                        value: -1,
+                        value: 2000,
                       },
                       name: {
                         buckets: [
@@ -366,7 +366,6 @@ describe('fetchCpuUsageNodeStats', () => {
 
       expect(stats).toEqual([
         {
-          missingLimits: true,
           clusterUuid: 'my-test-cluster',
           nodeId: 'my-test-node',
           nodeName: 'test-node',

diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts
@@ -179,14 +179,17 @@ async function fetchContainerStats(
       }
 
       const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1;
-      const notRunningInAContainer =
-        node.quota_micros_min.value === null && node.quota_micros_max.value === null;
-      if (limitsNotSet || notRunningInAContainer) {
+      if (limitsNotSet) {
+        const cpuUsage = node.average_cpu_usage_percent.value ?? undefined;
+
+        logger.warn(
+          `CPU usage rule: Node "${node.key}" does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%.`
+        );
+
         return {
-          missingLimits: true,
           clusterUuid: cluster.key as string,
           nodeId: node.key as string,
-          cpuUsage: node.average_cpu_usage_percent.value ?? undefined,
+          cpuUsage,
           nodeName,
           ccs,
         };
@@ -380,16 +383,17 @@ async function fetchNonContainerStats(
         ccs = index.includes(':') ? index.split(':')[0] : undefined;
       }
 
-      const runningInAContainer =
-        node.quota_micros_min.value !== null || node.quota_micros_max.value !== null;
+      const runningInAContainerWithLimits =
+        (node.quota_micros_min.value !== null && node.quota_micros_min.value !== -1) ||
+        (node.quota_micros_max.value !== null && node.quota_micros_max.value !== -1);
 
       return {
         clusterUuid: cluster.key as string,
         nodeId: node.key as string,
         cpuUsage: node.average_cpu.value ?? undefined,
         nodeName,
         ccs,
-        unexpectedLimits: runningInAContainer,
+        unexpectedLimits: runningInAContainerWithLimits,
       };
     });
   });