From ff0e34a797fc28446fc2efed5030bd453fa3505b Mon Sep 17 00:00:00 2001 From: Marco Antonio Ghiani Date: Tue, 26 Sep 2023 14:13:27 +0200 Subject: [PATCH 1/7] refactor(monitoring): update flows for cpu stats fetching --- .../server/lib/alerts/fetch_cpu_usage_node_stats.ts | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts index 014c38f447e1e..91f223b5eef2c 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts @@ -179,9 +179,7 @@ async function fetchContainerStats( } const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1; - const notRunningInAContainer = - node.quota_micros_min.value === null && node.quota_micros_max.value === null; - if (limitsNotSet || notRunningInAContainer) { + if (limitsNotSet) { return { missingLimits: true, clusterUuid: cluster.key as string, @@ -380,7 +378,7 @@ async function fetchNonContainerStats( ccs = index.includes(':') ? index.split(':')[0] : undefined; } - const runningInAContainer = + const runningInAContainerWithLimits = node.quota_micros_min.value !== null || node.quota_micros_max.value !== null; return { @@ -389,7 +387,7 @@ async function fetchNonContainerStats( cpuUsage: node.average_cpu.value ?? undefined, nodeName, ccs, - unexpectedLimits: runningInAContainer, + unexpectedLimits: runningInAContainerWithLimits, }; }); }); From 35c4133cae4af882911b0c2afc8feaf0c44889ef Mon Sep 17 00:00:00 2001 From: Marco Antonio Ghiani Date: Tue, 26 Sep 2023 15:08:11 +0200 Subject: [PATCH 2/7] refactor(monitoring): update conditional and log warning --- .../server/lib/alerts/fetch_cpu_usage_node_stats.ts | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts index 91f223b5eef2c..5511e18db553c 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts @@ -180,11 +180,16 @@ async function fetchContainerStats( const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1; if (limitsNotSet) { + const cpuUsage = node.average_cpu_usage_percent.value ?? undefined; + + logger.warn( + `CPU usage rule: Kibana is configured for containerized workloads but node "${node.key}" does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%.` + ); + return { - missingLimits: true, clusterUuid: cluster.key as string, nodeId: node.key as string, - cpuUsage: node.average_cpu_usage_percent.value ?? undefined, + cpuUsage, nodeName, ccs, }; @@ -379,7 +384,8 @@ async function fetchNonContainerStats( } const runningInAContainerWithLimits = - node.quota_micros_min.value !== null || node.quota_micros_max.value !== null; + (node.quota_micros_min.value !== null && node.quota_micros_min.value !== -1) || + (node.quota_micros_max.value !== null && node.quota_micros_max.value !== -1); return { clusterUuid: cluster.key as string, From 14c25441ce24612fc6cdad3dc30b9b8abed0f6ea Mon Sep 17 00:00:00 2001 From: Marco Antonio Ghiani Date: Tue, 26 Sep 2023 15:39:03 +0200 Subject: [PATCH 3/7] refactor(monitoring): update tests and remove unused warning --- .../plugins/monitoring/common/types/alerts.ts | 1 - .../server/alerts/cpu_usage_rule.test.ts | 40 ++++++++++++++----- .../server/alerts/cpu_usage_rule.ts | 27 +------------ .../alerts/fetch_cpu_usage_node_stats.test.ts | 5 +-- .../lib/alerts/fetch_cpu_usage_node_stats.ts | 2 +- 5 files changed, 35 insertions(+), 40 deletions(-) diff --git a/x-pack/plugins/monitoring/common/types/alerts.ts b/x-pack/plugins/monitoring/common/types/alerts.ts index 71943f42dd21f..adf00789d4056 100644 --- a/x-pack/plugins/monitoring/common/types/alerts.ts +++ b/x-pack/plugins/monitoring/common/types/alerts.ts @@ -171,7 +171,6 @@ export interface AlertNodeStats { export interface AlertCpuUsageNodeStats extends AlertNodeStats { cpuUsage?: number; limitsChanged?: boolean; - missingLimits?: boolean; unexpectedLimits?: boolean; } diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts index 171daed9f9d64..dcf1e80583726 100644 --- a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts +++ b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts @@ -258,12 +258,7 @@ describe('CpuUsageRule', () => { it('should fire actions when resource limits are missing', async () => { (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => { - return [ - { - ...stat, - missingLimits: true, - }, - ]; + return [stat]; }); const rule = new CpuUsageRule(); @@ -287,14 +282,39 @@ describe('CpuUsageRule', () => { nodeId, nodeName, threshold, - missingLimits: true, }, nodeId, nodeName, ui: { isFiring: true, message: { - text: `Kibana is configured for containerized workloads but node #start_linkmyNodeName#end_link does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%. Last checked at #absolute`, + text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`, + nextSteps: [ + { + text: '#start_linkCheck hot threads#end_link', + tokens: [ + { + startToken: '#start_link', + endToken: '#end_link', + type: 'docLink', + partialUrl: + '{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/cluster-nodes-hot-threads.html', + }, + ], + }, + { + text: '#start_linkCheck long running tasks#end_link', + tokens: [ + { + startToken: '#start_link', + endToken: '#end_link', + type: 'docLink', + partialUrl: + '{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html', + }, + ], + }, + ], tokens: [ { startToken: '#start_link', @@ -319,8 +339,8 @@ describe('CpuUsageRule', () => { ], }); expect(scheduleActions).toHaveBeenCalledWith('default', { - internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`, + internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`, action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, actionPlain: 'Verify CPU usage of node.', clusterName, diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts index 58265dbfdbad7..49ab66f2ce10d 100644 --- a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts +++ b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts @@ -100,12 +100,7 @@ export class CpuUsageRule extends BaseRule { stat: AlertCpuUsageNodeStats, threshold: number ): { shouldFire: boolean; severity: AlertSeverity } { - if ( - stat.missingLimits || - stat.limitsChanged || - stat.unexpectedLimits || - stat.cpuUsage === undefined - ) { + if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) { let severity = AlertSeverity.Warning; if (stat.cpuUsage && stat.cpuUsage > threshold) { severity = AlertSeverity.Danger; @@ -149,19 +144,6 @@ export class CpuUsageRule extends BaseRule { } as AlertMessageTimeToken, ]; - if (stat.missingLimits) { - return { - text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.missingLimits', { - defaultMessage: `Kibana is configured for containerized workloads but node #start_link{nodeName}#end_link does not have resource limits configured. Fallback metric reports usage of {cpuUsage}%. Last checked at #absolute`, - values: { - nodeName: stat.nodeName, - cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT), - }, - }), - tokens, - }; - } - if (stat.unexpectedLimits) { return { text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', { @@ -273,12 +255,7 @@ export class CpuUsageRule extends BaseRule { private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) { const stat = state.meta as AlertCpuUsageNodeStats; - if ( - stat.missingLimits || - stat.limitsChanged || - stat.unexpectedLimits || - stat.cpuUsage === undefined - ) { + if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) { return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', { defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`, values: { diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts index 9551b30d1c2d2..caa4d701ea9ae 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts @@ -126,10 +126,10 @@ describe('fetchCpuUsageNodeStats', () => { value: 45, }, quota_micros_max: { - value: -1, + value: 2000, }, quota_micros_min: { - value: -1, + value: 2000, }, name: { buckets: [ @@ -366,7 +366,6 @@ describe('fetchCpuUsageNodeStats', () => { expect(stats).toEqual([ { - missingLimits: true, clusterUuid: 'my-test-cluster', nodeId: 'my-test-node', nodeName: 'test-node', diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts index 5511e18db553c..45636675d119a 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts @@ -183,7 +183,7 @@ async function fetchContainerStats( const cpuUsage = node.average_cpu_usage_percent.value ?? undefined; logger.warn( - `CPU usage rule: Kibana is configured for containerized workloads but node "${node.key}" does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%.` + `CPU usage rule: Node "${node.key}" does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%.` ); return { From 5ea419d50e149ac395136b0a5b286618179af8e9 Mon Sep 17 00:00:00 2001 From: Marco Antonio Ghiani Date: Wed, 27 Sep 2023 12:16:32 +0200 Subject: [PATCH 4/7] refactor(monitoring): reduce logs noise and aggregate conditions --- .../lib/alerts/fetch_cpu_usage_node_stats.ts | 82 ++++--------------- 1 file changed, 18 insertions(+), 64 deletions(-) diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts index 45636675d119a..5ccaa522c7368 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts @@ -14,14 +14,6 @@ import { MonitoringConfig } from '../../config'; import { getElasticsearchDataset, getIndexPatterns } from '../cluster/get_index_patterns'; import { createDatasetFilter } from './create_dataset_query_filter'; -interface CpuUsageFieldsWithValues { - 'max of node_stats.os.cgroup.cpu.cfs_quota_micros': number | null; - 'max of node_stats.os.cgroup.cpuacct.usage_nanos': number | null; - 'min of node_stats.os.cgroup.cpuacct.usage_nanos': number | null; - 'max of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': number | null; - 'min of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': number | null; -} - interface Options { esClient: ElasticsearchClient; clusterUuids: string[]; @@ -45,7 +37,7 @@ export async function fetchCpuUsageNodeStats( } async function fetchContainerStats( - { esClient, startMs, endMs, clusterUuids, filterQuery, logger }: Options, + { esClient, startMs, endMs, clusterUuids, filterQuery }: Options, config: MonitoringConfig ) { const indexPatterns = getIndexPatterns({ @@ -178,61 +170,34 @@ async function fetchContainerStats( ccs = index.includes(':') ? index.split(':')[0] : undefined; } - const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1; - if (limitsNotSet) { - const cpuUsage = node.average_cpu_usage_percent.value ?? undefined; - - logger.warn( - `CPU usage rule: Node "${node.key}" does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%.` - ); - - return { - clusterUuid: cluster.key as string, - nodeId: node.key as string, - cpuUsage, - nodeName, - ccs, - }; - } + const nodeStats = { + clusterUuid: cluster.key as string, + nodeId: node.key as string, + nodeName, + ccs, + }; - if (node.quota_micros_min.value !== node.quota_micros_max.value) { - return { - limitsChanged: true, - clusterUuid: cluster.key as string, - nodeId: node.key as string, - cpuUsage: undefined, - nodeName, - ccs, - }; - } + const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1; if ( + limitsNotSet || node.max_usage_nanos.value === null || node.min_usage_nanos.value === null || node.max_periods.value === null || node.min_periods.value === null || node.quota_micros_max.value === null ) { - logger.warn( - `CPU usage rule: Some aggregated values needed for container CPU usage calculation was empty: ${findEmptyValues( - { - 'max of node_stats.os.cgroup.cpu.cfs_quota_micros': node.quota_micros_max.value, - 'max of node_stats.os.cgroup.cpuacct.usage_nanos': node.max_usage_nanos.value, - 'min of node_stats.os.cgroup.cpuacct.usage_nanos': node.min_usage_nanos.value, - 'max of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': - node.max_periods.value, - 'min of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': - node.min_periods.value, - } - )}` - ); + return { + ...nodeStats, + cpuUsage: node.average_cpu_usage_percent.value ?? undefined, + }; + } + if (node.quota_micros_min.value !== node.quota_micros_max.value) { return { - clusterUuid: cluster.key as string, - nodeId: node.key as string, + ...nodeStats, + limitsChanged: true, cpuUsage: undefined, - nodeName, - ccs, }; } @@ -246,24 +211,13 @@ async function fetchContainerStats( ); return { - clusterUuid: cluster.key as string, - nodeId: node.key as string, + ...nodeStats, cpuUsage: Math.round(cpuUsage * 100) / 100, - nodeName, - ccs, }; }); }); } -function findEmptyValues(fieldsWithValues: CpuUsageFieldsWithValues): string { - const entries: Array<[string, number | null]> = Object.entries(fieldsWithValues); - return entries - .filter(([, value]) => value === null) - .map(([key]) => key) - .join(', '); -} - function computeCfsPercentCpuUsage(usageNanos: number, quotaMicros: number, periods: number) { // See https://github.com/elastic/kibana/pull/159351 for an explanation of this formula const quotaNanos = quotaMicros * 1000; From fff5166ae8f6c4192597c3c06af01f742d5db4f7 Mon Sep 17 00:00:00 2001 From: Marco Antonio Ghiani Date: Wed, 27 Sep 2023 13:10:07 +0200 Subject: [PATCH 5/7] refactor(monitoring): remove old test --- packages/kbn-timelion-grammar/index.d.ts | 3 + .../alerts/fetch_cpu_usage_node_stats.test.ts | 78 ------------------- 2 files changed, 3 insertions(+), 78 deletions(-) create mode 100644 packages/kbn-timelion-grammar/index.d.ts diff --git a/packages/kbn-timelion-grammar/index.d.ts b/packages/kbn-timelion-grammar/index.d.ts new file mode 100644 index 0000000000000..126df78746059 --- /dev/null +++ b/packages/kbn-timelion-grammar/index.d.ts @@ -0,0 +1,3 @@ +/// +declare const _exports: typeof import("*.peggy"); +export = _exports; diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts index caa4d701ea9ae..214a7c04005f5 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts @@ -453,83 +453,5 @@ describe('fetchCpuUsageNodeStats', () => { }, ]); }); - - it('warns about failing to compute usage due to values missing', async () => { - esClient.search.mockResponse({ - aggregations: { - clusters: { - buckets: [ - { - key: 'my-test-cluster', - nodes: { - buckets: [ - { - key: 'my-test-node', - min_usage_nanos: { - value: null, - }, - max_usage_nanos: { - value: null, - }, - min_periods: { - value: null, - }, - max_periods: { - value: null, - }, - quota_micros_min: { - value: 10000, - }, - quota_micros_max: { - value: 10000, - }, - average_cpu_usage_percent: { - value: 45, - }, - name: { - buckets: [ - { - key: 'test-node', - }, - ], - }, - index: { - buckets: [ - { - key: 'a-local-index', - }, - ], - }, - }, - ], - }, - }, - ], - }, - }, - } as any); - - const stats = await fetchCpuUsageNodeStats( - { - esClient, - clusterUuids: ['my-test-cluster'], - startMs: 0, - endMs: 10, - filterQuery, - logger: loggerMock.create(), - }, - configSlice - ); - - expect(stats).toEqual([ - { - clusterUuid: 'my-test-cluster', - nodeId: 'my-test-node', - nodeName: 'test-node', - ccs: undefined, - cpuUsage: undefined, - }, - ]); - }); }); }); From 298cc5901238583a7a4de08669e7b07113f5e2f8 Mon Sep 17 00:00:00 2001 From: kibanamachine <42973632+kibanamachine@users.noreply.github.com> Date: Wed, 27 Sep 2023 11:16:24 +0000 Subject: [PATCH 6/7] [CI] Auto-commit changed files from 'node scripts/precommit_hook.js --ref HEAD~1..HEAD --fix' --- packages/kbn-timelion-grammar/index.d.ts | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/packages/kbn-timelion-grammar/index.d.ts b/packages/kbn-timelion-grammar/index.d.ts index 126df78746059..a2cb07a401719 100644 --- a/packages/kbn-timelion-grammar/index.d.ts +++ b/packages/kbn-timelion-grammar/index.d.ts @@ -1,3 +1,11 @@ -/// -declare const _exports: typeof import("*.peggy"); +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +// / +declare const _exports: typeof import('*.peggy'); export = _exports; From 16ed414a962286cfb89fe0fa8c18e4544eccba38 Mon Sep 17 00:00:00 2001 From: Marco Antonio Ghiani Date: Wed, 27 Sep 2023 14:30:39 +0200 Subject: [PATCH 7/7] refactor(monitoring): remove automatic file addition --- packages/kbn-timelion-grammar/index.d.ts | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 packages/kbn-timelion-grammar/index.d.ts diff --git a/packages/kbn-timelion-grammar/index.d.ts b/packages/kbn-timelion-grammar/index.d.ts deleted file mode 100644 index a2cb07a401719..0000000000000 --- a/packages/kbn-timelion-grammar/index.d.ts +++ /dev/null @@ -1,11 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the Elastic License - * 2.0 and the Server Side Public License, v 1; you may not use this file except - * in compliance with, at your election, the Elastic License 2.0 or the Server - * Side Public License, v 1. - */ - -// / -declare const _exports: typeof import('*.peggy'); -export = _exports;