elastic · tonyghiani · Sep 28, 2023 · Sep 26, 2023 · Sep 26, 2023 · Sep 26, 2023
diff --git a/x-pack/plugins/monitoring/common/types/alerts.ts b/x-pack/plugins/monitoring/common/types/alerts.ts
@@ -171,7 +171,6 @@ export interface AlertNodeStats {
 export interface AlertCpuUsageNodeStats extends AlertNodeStats {
   cpuUsage?: number;
   limitsChanged?: boolean;
-  missingLimits?: boolean;
   unexpectedLimits?: boolean;
 }
 

diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts
@@ -258,12 +258,7 @@ describe('CpuUsageRule', () => {
 
     it('should fire actions when resource limits are missing', async () => {
       (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => {
-        return [
-          {
-            ...stat,
-            missingLimits: true,
-          },
-        ];
+        return [stat];
       });
 
       const rule = new CpuUsageRule();
@@ -287,14 +282,39 @@ describe('CpuUsageRule', () => {
               nodeId,
               nodeName,
               threshold,
-              missingLimits: true,
             },
             nodeId,
             nodeName,
             ui: {
               isFiring: true,
               message: {
-                text: `Kibana is configured for containerized workloads but node #start_linkmyNodeName#end_link does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%. Last checked at #absolute`,
+                text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`,
+                nextSteps: [
+                  {
+                    text: '#start_linkCheck hot threads#end_link',
+                    tokens: [
+                      {
+                        startToken: '#start_link',
+                        endToken: '#end_link',
+                        type: 'docLink',
+                        partialUrl:
+                          '{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/cluster-nodes-hot-threads.html',
+                      },
+                    ],
+                  },
+                  {
+                    text: '#start_linkCheck long running tasks#end_link',
+                    tokens: [
+                      {
+                        startToken: '#start_link',
+                        endToken: '#end_link',
+                        type: 'docLink',
+                        partialUrl:
+                          '{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html',
+                      },
+                    ],
+                  },
+                ],
                 tokens: [
                   {
                     startToken: '#start_link',
@@ -319,8 +339,8 @@ describe('CpuUsageRule', () => {
         ],
       });
       expect(scheduleActions).toHaveBeenCalledWith('default', {
-        internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
-        internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`,
+        internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
+        internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`,
         action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
         actionPlain: 'Verify CPU usage of node.',
         clusterName,

diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts
@@ -100,12 +100,7 @@ export class CpuUsageRule extends BaseRule {
     stat: AlertCpuUsageNodeStats,
     threshold: number
   ): { shouldFire: boolean; severity: AlertSeverity } {
-    if (
-      stat.missingLimits ||
-      stat.limitsChanged ||
-      stat.unexpectedLimits ||
-      stat.cpuUsage === undefined
-    ) {
+    if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
       let severity = AlertSeverity.Warning;
       if (stat.cpuUsage && stat.cpuUsage > threshold) {
         severity = AlertSeverity.Danger;
@@ -149,19 +144,6 @@ export class CpuUsageRule extends BaseRule {
       } as AlertMessageTimeToken,
     ];
 
-    if (stat.missingLimits) {
-      return {
-        text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.missingLimits', {
-          defaultMessage: `Kibana is configured for containerized workloads but node #start_link{nodeName}#end_link does not have resource limits configured. Fallback metric reports usage of {cpuUsage}%. Last checked at #absolute`,
-          values: {
-            nodeName: stat.nodeName,
-            cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT),
-          },
-        }),
-        tokens,
-      };
-    }
-
     if (stat.unexpectedLimits) {
       return {
         text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', {
@@ -273,12 +255,7 @@ export class CpuUsageRule extends BaseRule {
   private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) {
     const stat = state.meta as AlertCpuUsageNodeStats;
 
-    if (
-      stat.missingLimits ||
-      stat.limitsChanged ||
-      stat.unexpectedLimits ||
-      stat.cpuUsage === undefined
-    ) {
+    if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
       return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', {
         defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`,
         values: {

diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts
@@ -126,10 +126,10 @@ describe('fetchCpuUsageNodeStats', () => {
                         value: 45,
                       },
                       quota_micros_max: {
-                        value: -1,
+                        value: 2000,
                       },
                       quota_micros_min: {
-                        value: -1,
+                        value: 2000,
                       },
                       name: {
                         buckets: [
@@ -366,7 +366,6 @@ describe('fetchCpuUsageNodeStats', () => {
 
       expect(stats).toEqual([
         {
-          missingLimits: true,
           clusterUuid: 'my-test-cluster',
           nodeId: 'my-test-node',
           nodeName: 'test-node',
@@ -454,83 +453,5 @@ describe('fetchCpuUsageNodeStats', () => {
         },
       ]);
     });
-
-    it('warns about failing to compute usage due to values missing', async () => {
-      esClient.search.mockResponse({
-        aggregations: {
-          clusters: {
-            buckets: [
-              {
-                key: 'my-test-cluster',
-                nodes: {
-                  buckets: [
-                    {
-                      key: 'my-test-node',
-                      min_usage_nanos: {
-                        value: null,
-                      },
-                      max_usage_nanos: {
-                        value: null,
-                      },
-                      min_periods: {
-                        value: null,
-                      },
-                      max_periods: {
-                        value: null,
-                      },
-                      quota_micros_min: {
-                        value: 10000,
-                      },
-                      quota_micros_max: {
-                        value: 10000,
-                      },
-                      average_cpu_usage_percent: {
-                        value: 45,
-                      },
-                      name: {
-                        buckets: [
-                          {
-                            key: 'test-node',
-                          },
-                        ],
-                      },
-                      index: {
-                        buckets: [
-                          {
-                            key: 'a-local-index',
-                          },
-                        ],
-                      },
-                    },
-                  ],
-                },
-              },
-            ],
-          },
-        },
-      } as any);
-
-      const stats = await fetchCpuUsageNodeStats(
-        {
-          esClient,
-          clusterUuids: ['my-test-cluster'],
-          startMs: 0,
-          endMs: 10,
-          filterQuery,
-          logger: loggerMock.create(),
-        },
-        configSlice
-      );
-
-      expect(stats).toEqual([
-        {
-          clusterUuid: 'my-test-cluster',
-          nodeId: 'my-test-node',
-          nodeName: 'test-node',
-          ccs: undefined,
-          cpuUsage: undefined,
-        },
-      ]);
-    });
   });
 });
diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts
@@ -14,14 +14,6 @@ import { MonitoringConfig } from '../../config';
 import { getElasticsearchDataset, getIndexPatterns } from '../cluster/get_index_patterns';
 import { createDatasetFilter } from './create_dataset_query_filter';
 
-interface CpuUsageFieldsWithValues {
-  'max of node_stats.os.cgroup.cpu.cfs_quota_micros': number | null;
-  'max of node_stats.os.cgroup.cpuacct.usage_nanos': number | null;
-  'min of node_stats.os.cgroup.cpuacct.usage_nanos': number | null;
-  'max of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': number | null;
-  'min of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': number | null;
-}
-
 interface Options {
   esClient: ElasticsearchClient;
   clusterUuids: string[];
@@ -45,7 +37,7 @@ export async function fetchCpuUsageNodeStats(
 }
 
 async function fetchContainerStats(
-  { esClient, startMs, endMs, clusterUuids, filterQuery, logger }: Options,
+  { esClient, startMs, endMs, clusterUuids, filterQuery }: Options,
   config: MonitoringConfig
 ) {
   const indexPatterns = getIndexPatterns({
@@ -178,58 +170,34 @@ async function fetchContainerStats(
         ccs = index.includes(':') ? index.split(':')[0] : undefined;
       }
 
-      const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1;
-      const notRunningInAContainer =
-        node.quota_micros_min.value === null && node.quota_micros_max.value === null;
-      if (limitsNotSet || notRunningInAContainer) {
-        return {
-          missingLimits: true,
-          clusterUuid: cluster.key as string,
-          nodeId: node.key as string,
-          cpuUsage: node.average_cpu_usage_percent.value ?? undefined,
-          nodeName,
-          ccs,
-        };
-      }
+      const nodeStats = {
+        clusterUuid: cluster.key as string,
+        nodeId: node.key as string,
+        nodeName,
+        ccs,
+      };
 
-      if (node.quota_micros_min.value !== node.quota_micros_max.value) {
-        return {
-          limitsChanged: true,
-          clusterUuid: cluster.key as string,
-          nodeId: node.key as string,
-          cpuUsage: undefined,
-          nodeName,
-          ccs,
-        };
-      }
+      const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1;
 
       if (
+        limitsNotSet ||
         node.max_usage_nanos.value === null ||
         node.min_usage_nanos.value === null ||
         node.max_periods.value === null ||
         node.min_periods.value === null ||
         node.quota_micros_max.value === null
       ) {
-        logger.warn(
-          `CPU usage rule: Some aggregated values needed for container CPU usage calculation was empty: ${findEmptyValues(
-            {
-              'max of node_stats.os.cgroup.cpu.cfs_quota_micros': node.quota_micros_max.value,
-              'max of node_stats.os.cgroup.cpuacct.usage_nanos': node.max_usage_nanos.value,
-              'min of node_stats.os.cgroup.cpuacct.usage_nanos': node.min_usage_nanos.value,
-              'max of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods':
-                node.max_periods.value,
-              'min of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods':
-                node.min_periods.value,
-            }
-          )}`
-        );
+        return {
+          ...nodeStats,
+          cpuUsage: node.average_cpu_usage_percent.value ?? undefined,
+        };
+      }
 
+      if (node.quota_micros_min.value !== node.quota_micros_max.value) {
         return {
-          clusterUuid: cluster.key as string,
-          nodeId: node.key as string,
+          ...nodeStats,
+          limitsChanged: true,
           cpuUsage: undefined,
-          nodeName,
-          ccs,
         };
       }
 
@@ -243,24 +211,13 @@ async function fetchContainerStats(
       );
 
       return {
-        clusterUuid: cluster.key as string,
-        nodeId: node.key as string,
+        ...nodeStats,
         cpuUsage: Math.round(cpuUsage * 100) / 100,
-        nodeName,
-        ccs,
       };
     });
   });
 }
 
-function findEmptyValues(fieldsWithValues: CpuUsageFieldsWithValues): string {
-  const entries: Array<[string, number | null]> = Object.entries(fieldsWithValues);
-  return entries
-    .filter(([, value]) => value === null)
-    .map(([key]) => key)
-    .join(', ');
-}
-
 function computeCfsPercentCpuUsage(usageNanos: number, quotaMicros: number, periods: number) {
   // See https://github.com/elastic/kibana/pull/159351 for an explanation of this formula
   const quotaNanos = quotaMicros * 1000;
@@ -380,16 +337,17 @@ async function fetchNonContainerStats(
         ccs = index.includes(':') ? index.split(':')[0] : undefined;
       }
 
-      const runningInAContainer =
-        node.quota_micros_min.value !== null || node.quota_micros_max.value !== null;
+      const runningInAContainerWithLimits =
+        (node.quota_micros_min.value !== null && node.quota_micros_min.value !== -1) ||
+        (node.quota_micros_max.value !== null && node.quota_micros_max.value !== -1);
 
       return {
         clusterUuid: cluster.key as string,
         nodeId: node.key as string,
         cpuUsage: node.average_cpu.value ?? undefined,
         nodeName,
         ccs,
-        unexpectedLimits: runningInAContainer,
+        unexpectedLimits: runningInAContainerWithLimits,
       };
     });
   });