From 55bc6d505977e8831633cc76e0f46b2ca66ef559 Mon Sep 17 00:00:00 2001 From: Milton Hultgren Date: Fri, 8 Dec 2023 16:25:23 +0100 Subject: [PATCH] [monitoring] Revert CPU Usage rule changes (#172913) Reverts https://github.com/elastic/kibana/pull/159351 Reverts https://github.com/elastic/kibana/pull/167244 Due to the many unexpected issues that these changes introduced we've decided to revert these changes until we have better solutions for the problems we've learnt about. Problems: - Gaps in data cause alerts to fire (see next point) - Normal CPU rescaling causes alerts to fire https://github.com/elastic/kibana/issues/160905 - Any error fires an alert (since there is no other way to inform the user about the problems faced by the rule executor) - Many assumptions about cgroups only being for container users are wrong To address some of these issues we also need more functionality in the alerting framework to be able to register secondary actions so that we may trigger non-oncall workflows for when a rule faces issues with evaluating the stats. Original issue https://github.com/elastic/kibana/issues/116128 --- .../plugins/monitoring/common/types/alerts.ts | 7 +- .../server/alerts/cpu_usage_rule.test.ts | 351 +----------- .../server/alerts/cpu_usage_rule.ts | 187 ++----- .../fetch_cpu_usage_node_stats.test.ts.snap | 247 --------- .../alerts/fetch_cpu_usage_node_stats.test.ts | 519 ++++++------------ .../lib/alerts/fetch_cpu_usage_node_stats.ts | 444 +++++---------- x-pack/plugins/monitoring/tsconfig.json | 1 - .../translations/translations/fr-FR.json | 9 +- .../translations/translations/ja-JP.json | 9 +- .../translations/translations/zh-CN.json | 9 +- 10 files changed, 398 insertions(+), 1385 deletions(-) delete mode 100644 x-pack/plugins/monitoring/server/lib/alerts/__snapshots__/fetch_cpu_usage_node_stats.test.ts.snap diff --git a/x-pack/plugins/monitoring/common/types/alerts.ts b/x-pack/plugins/monitoring/common/types/alerts.ts index adf00789d4056..d00cc90c5516b 100644 --- a/x-pack/plugins/monitoring/common/types/alerts.ts +++ b/x-pack/plugins/monitoring/common/types/alerts.ts @@ -169,9 +169,10 @@ export interface AlertNodeStats { } export interface AlertCpuUsageNodeStats extends AlertNodeStats { - cpuUsage?: number; - limitsChanged?: boolean; - unexpectedLimits?: boolean; + cpuUsage: number; + containerUsage: number; + containerPeriods: number; + containerQuota: number; } export interface AlertThreadPoolRejectionsStats { diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts index dcf1e80583726..6c5858d48e94e 100644 --- a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts +++ b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts @@ -42,7 +42,7 @@ describe('CpuUsageRule', () => { expect(rule.ruleOptions.throttle).toBe('1d'); expect(rule.ruleOptions.defaultParams).toStrictEqual({ threshold: 85, duration: '5m' }); expect(rule.ruleOptions.actionVariables).toStrictEqual([ - { name: 'node', description: 'The node reporting high CPU usage.' }, + { name: 'node', description: 'The node reporting high cpu usage.' 
}, { name: 'internalShortMessage', description: 'The short internal message generated by Elastic.', @@ -114,7 +114,7 @@ describe('CpuUsageRule', () => { getState.mockReset(); }); - it('should fire actions when threshold is exceeded', async () => { + it('should fire actions', async () => { const rule = new CpuUsageRule(); const type = rule.getRuleType(); await type.executor({ @@ -122,7 +122,6 @@ describe('CpuUsageRule', () => { params: rule.ruleOptions.defaultParams, } as any); const count = 1; - const threshold = rule.ruleOptions.defaultParams?.threshold; expect(replaceState).toHaveBeenCalledWith({ alertStates: [ { @@ -135,14 +134,13 @@ describe('CpuUsageRule', () => { cpuUsage, nodeId, nodeName, - threshold, }, nodeId, nodeName, ui: { isFiring: true, message: { - text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`, + text: `Node #start_link${nodeName}#end_link is reporting cpu usage of ${cpuUsage}% at #absolute`, nextSteps: [ { text: '#start_linkCheck hot threads#end_link', @@ -170,12 +168,6 @@ describe('CpuUsageRule', () => { }, ], tokens: [ - { - startToken: '#start_link', - endToken: '#end_link', - type: 'link', - url: 'elasticsearch/nodes/myNodeId', - }, { startToken: '#absolute', type: 'time', @@ -183,6 +175,12 @@ describe('CpuUsageRule', () => { isRelative: false, timestamp: 1, }, + { + startToken: '#start_link', + endToken: '#end_link', + type: 'link', + url: 'elasticsearch/nodes/myNodeId', + }, ], }, severity: 'danger', @@ -193,10 +191,10 @@ describe('CpuUsageRule', () => { ], }); expect(scheduleActions).toHaveBeenCalledWith('default', { - internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`, + internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, + internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. Verify CPU level of node.`, action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - actionPlain: 'Verify CPU usage of node.', + actionPlain: 'Verify CPU level of node.', clusterName, count, nodes: `${nodeName}:${cpuUsage}`, @@ -244,10 +242,10 @@ describe('CpuUsageRule', () => { } as any); const count = 1; expect(scheduleActions).toHaveBeenCalledWith('default', { - internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:${ccs}))`, - internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`, + internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:${ccs}))`, + internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster: ${clusterName}. 
Verify CPU level of node.`, action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid},ccs:testCluster))`, - actionPlain: 'Verify CPU usage of node.', + actionPlain: 'Verify CPU level of node.', clusterName, count, nodes: `${nodeName}:${cpuUsage}`, @@ -255,324 +253,5 @@ describe('CpuUsageRule', () => { state: 'firing', }); }); - - it('should fire actions when resource limits are missing', async () => { - (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => { - return [stat]; - }); - - const rule = new CpuUsageRule(); - const type = rule.getRuleType(); - await type.executor({ - ...executorOptions, - params: rule.ruleOptions.defaultParams, - } as any); - const count = 1; - const threshold = rule.ruleOptions.defaultParams?.threshold; - expect(replaceState).toHaveBeenCalledWith({ - alertStates: [ - { - ccs: undefined, - cluster: { clusterUuid, clusterName }, - cpuUsage, - itemLabel: undefined, - meta: { - clusterUuid, - cpuUsage, - nodeId, - nodeName, - threshold, - }, - nodeId, - nodeName, - ui: { - isFiring: true, - message: { - text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`, - nextSteps: [ - { - text: '#start_linkCheck hot threads#end_link', - tokens: [ - { - startToken: '#start_link', - endToken: '#end_link', - type: 'docLink', - partialUrl: - '{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/cluster-nodes-hot-threads.html', - }, - ], - }, - { - text: '#start_linkCheck long running tasks#end_link', - tokens: [ - { - startToken: '#start_link', - endToken: '#end_link', - type: 'docLink', - partialUrl: - '{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html', - }, - ], - }, - ], - tokens: [ - { - startToken: '#start_link', - endToken: '#end_link', - type: 'link', - url: 'elasticsearch/nodes/myNodeId', - }, - { - startToken: '#absolute', - type: 'time', - isAbsolute: true, - isRelative: false, - timestamp: 1, - }, - ], - }, - severity: 'danger', - triggeredMS: 1, - lastCheckedMS: 0, - }, - }, - ], - }); - expect(scheduleActions).toHaveBeenCalledWith('default', { - internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. 
Verify CPU usage of node.`, - action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - actionPlain: 'Verify CPU usage of node.', - clusterName, - count, - nodes: `${nodeName}:${cpuUsage}`, - node: `${nodeName}:${cpuUsage}`, - state: 'firing', - }); - }); - - it('should fire actions when resource limits have changed', async () => { - (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => { - return [ - { - ...stat, - limitsChanged: true, - }, - ]; - }); - - const rule = new CpuUsageRule(); - const type = rule.getRuleType(); - await type.executor({ - ...executorOptions, - params: rule.ruleOptions.defaultParams, - } as any); - const count = 1; - const threshold = rule.ruleOptions.defaultParams?.threshold; - expect(replaceState).toHaveBeenCalledWith({ - alertStates: [ - { - ccs: undefined, - cluster: { clusterUuid, clusterName }, - cpuUsage, - itemLabel: undefined, - meta: { - clusterUuid, - cpuUsage, - nodeId, - nodeName, - threshold, - limitsChanged: true, - }, - nodeId, - nodeName, - ui: { - isFiring: true, - message: { - text: 'Resource limits for node #start_linkmyNodeName#end_link has changed within the look back window, unable to confidently calculate CPU usage for alerting. Please monitor the usage until the window has moved. Last checked at #absolute', - tokens: [ - { - startToken: '#start_link', - endToken: '#end_link', - type: 'link', - url: 'elasticsearch/nodes/myNodeId', - }, - { - startToken: '#absolute', - type: 'time', - isAbsolute: true, - isRelative: false, - timestamp: 1, - }, - ], - }, - severity: 'danger', - triggeredMS: 1, - lastCheckedMS: 0, - }, - }, - ], - }); - expect(scheduleActions).toHaveBeenCalledWith('default', { - internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`, - action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - actionPlain: 'Verify CPU usage of node.', - clusterName, - count, - nodes: `${nodeName}:${cpuUsage}`, - node: `${nodeName}:${cpuUsage}`, - state: 'firing', - }); - }); - - it('should fire actions when resource limits are set but not expected', async () => { - (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => { - return [ - { - ...stat, - unexpectedLimits: true, - }, - ]; - }); - - const rule = new CpuUsageRule(); - const type = rule.getRuleType(); - await type.executor({ - ...executorOptions, - params: rule.ruleOptions.defaultParams, - } as any); - const count = 1; - const threshold = rule.ruleOptions.defaultParams?.threshold; - expect(replaceState).toHaveBeenCalledWith({ - alertStates: [ - { - ccs: undefined, - cluster: { clusterUuid, clusterName }, - cpuUsage, - itemLabel: undefined, - meta: { - clusterUuid, - cpuUsage, - nodeId, - nodeName, - threshold, - unexpectedLimits: true, - }, - nodeId, - nodeName, - ui: { - isFiring: true, - message: { - text: `Kibana is configured for non-containerized workloads but node #start_linkmyNodeName#end_link has resource limits configured. Node reports usage of ${cpuUsage}%. 
Last checked at #absolute`, - tokens: [ - { - startToken: '#start_link', - endToken: '#end_link', - type: 'link', - url: 'elasticsearch/nodes/myNodeId', - }, - { - startToken: '#absolute', - type: 'time', - isAbsolute: true, - isRelative: false, - timestamp: 1, - }, - ], - }, - severity: 'danger', - triggeredMS: 1, - lastCheckedMS: 0, - }, - }, - ], - }); - expect(scheduleActions).toHaveBeenCalledWith('default', { - internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`, - action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - actionPlain: 'Verify CPU usage of node.', - clusterName, - count, - nodes: `${nodeName}:${cpuUsage}`, - node: `${nodeName}:${cpuUsage}`, - state: 'firing', - }); - }); - - it('should fire actions when it fails to calculate CPU usage', async () => { - (fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => { - return [ - { - ...stat, - cpuUsage: undefined, - }, - ]; - }); - - const rule = new CpuUsageRule(); - const type = rule.getRuleType(); - await type.executor({ - ...executorOptions, - params: rule.ruleOptions.defaultParams, - } as any); - const count = 1; - const threshold = rule.ruleOptions.defaultParams?.threshold; - expect(replaceState).toHaveBeenCalledWith({ - alertStates: [ - { - ccs: undefined, - cluster: { clusterUuid, clusterName }, - cpuUsage: undefined, - itemLabel: undefined, - meta: { - clusterUuid, - cpuUsage: undefined, - nodeId, - nodeName, - threshold, - }, - nodeId, - nodeName, - ui: { - isFiring: true, - message: { - text: 'Failed to compute CPU usage for node #start_linkmyNodeName#end_link. Please check the Kibana logs for more details. Last checked at #absolute', - tokens: [ - { - startToken: '#start_link', - endToken: '#end_link', - type: 'link', - url: 'elasticsearch/nodes/myNodeId', - }, - { - startToken: '#absolute', - type: 'time', - isAbsolute: true, - isRelative: false, - timestamp: 1, - }, - ], - }, - severity: 'warning', - triggeredMS: 1, - lastCheckedMS: 0, - }, - }, - ], - }); - expect(scheduleActions).toHaveBeenCalledWith('default', { - internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. 
Verify CPU usage of node.`, - action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`, - actionPlain: 'Verify CPU usage of node.', - clusterName, - count, - nodes: `${nodeName}:undefined`, - node: `${nodeName}:undefined`, - state: 'firing', - }); - }); }); }); diff --git a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts index 49ab66f2ce10d..92c45c9e61ae2 100644 --- a/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts +++ b/x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts @@ -11,7 +11,6 @@ import { ElasticsearchClient } from '@kbn/core/server'; import { Alert } from '@kbn/alerting-plugin/server'; import { RawAlertInstance, SanitizedRule } from '@kbn/alerting-plugin/common'; import { parseDuration } from '@kbn/alerting-plugin/common/parse_duration'; -import { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types'; import { BaseRule } from './base_rule'; import { AlertData, @@ -47,7 +46,7 @@ export class CpuUsageRule extends BaseRule { { name: 'node', description: i18n.translate('xpack.monitoring.alerts.cpuUsage.actionVariables.node', { - defaultMessage: 'The node reporting high CPU usage.', + defaultMessage: 'The node reporting high cpu usage.', }), }, ...Object.values(AlertingDefaults.ALERT_TYPE.context), @@ -63,52 +62,28 @@ export class CpuUsageRule extends BaseRule { const duration = parseDuration(params.duration); const endMs = +new Date(); const startMs = endMs - duration; - - let filterQuery; - if (params.filterQuery) { - try { - filterQuery = JSON.parse(params.filterQuery) as QueryDslQueryContainer; - } catch (error) { - throw new Error(`Failed to parse filter query in CPU usage rule ${error}`); - } - } - const stats = await fetchCpuUsageNodeStats( - { - esClient, - clusterUuids: clusters.map((cluster) => cluster.clusterUuid), - startMs, - endMs, - filterQuery, - logger: this.scopedLogger, - }, - Globals.app.config + esClient, + clusters, + startMs, + endMs, + Globals.app.config.ui.max_bucket_size, + params.filterQuery ); - - return stats.map((stat) => ({ - clusterUuid: stat.clusterUuid, - ...this.outcomeAndSeverity(stat, params.threshold!), - meta: { - ...stat, - threshold: params.threshold!, - }, - ccs: stat.ccs, - })); - } - - private outcomeAndSeverity( - stat: AlertCpuUsageNodeStats, - threshold: number - ): { shouldFire: boolean; severity: AlertSeverity } { - if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) { - let severity = AlertSeverity.Warning; - if (stat.cpuUsage && stat.cpuUsage > threshold) { - severity = AlertSeverity.Danger; + return stats.map((stat) => { + if (Globals.app.config.ui.container.elasticsearch.enabled) { + stat.cpuUsage = + (stat.containerUsage / (stat.containerPeriods * stat.containerQuota * 1000)) * 100; } - return { shouldFire: true, severity }; - } - return { shouldFire: stat.cpuUsage > threshold, severity: AlertSeverity.Danger }; + return { + clusterUuid: stat.clusterUuid, + shouldFire: stat.cpuUsage > params.threshold!, + severity: AlertSeverity.Danger, + meta: stat, + ccs: stat.ccs, + }; + }); } protected filterAlertInstance(alertInstance: RawAlertInstance, filters: CommonAlertFilter[]) { @@ -127,67 +102,13 @@ export class CpuUsageRule extends BaseRule { } protected getUiMessage(alertState: AlertState, item: AlertData): AlertMessage { - const stat = item.meta as AlertCpuUsageNodeStats & Pick; - const tokens = [ - { - startToken: '#start_link', - endToken: 
'#end_link', - type: AlertMessageTokenType.Link, - url: `elasticsearch/nodes/${stat.nodeId}`, - } as AlertMessageLinkToken, - { - startToken: '#absolute', - type: AlertMessageTokenType.Time, - isAbsolute: true, - isRelative: false, - timestamp: alertState.ui.triggeredMS, - } as AlertMessageTimeToken, - ]; - - if (stat.unexpectedLimits) { - return { - text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', { - defaultMessage: `Kibana is configured for non-containerized workloads but node #start_link{nodeName}#end_link has resource limits configured. Node reports usage of {cpuUsage}%. Last checked at #absolute`, - values: { - nodeName: stat.nodeName, - cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT), - }, - }), - tokens, - }; - } - - if (stat.limitsChanged) { - return { - text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.limitsChanged', { - defaultMessage: `Resource limits for node #start_link{nodeName}#end_link has changed within the look back window, unable to confidently calculate CPU usage for alerting. Please monitor the usage until the window has moved. Last checked at #absolute`, - values: { - nodeName: stat.nodeName, - }, - }), - tokens, - }; - } - - if (stat.cpuUsage === undefined) { - return { - text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage', { - defaultMessage: `Failed to compute CPU usage for node #start_link{nodeName}#end_link. Please check the Kibana logs for more details. Last checked at #absolute`, - values: { - nodeName: stat.nodeName, - }, - }), - tokens, - }; - } - + const stat = item.meta as AlertCpuUsageNodeStats; return { text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.firingMessage', { - defaultMessage: `Node #start_link{nodeName}#end_link is reporting CPU usage of {cpuUsage}% which is above the configured threshold of {threshold}%. 
Last checked at #absolute`, + defaultMessage: `Node #start_link{nodeName}#end_link is reporting cpu usage of {cpuUsage}% at #absolute`, values: { nodeName: stat.nodeName, cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT), - threshold: stat.threshold, }, }), nextSteps: [ @@ -204,7 +125,21 @@ export class CpuUsageRule extends BaseRule { `{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html` ), ], - tokens, + tokens: [ + { + startToken: '#absolute', + type: AlertMessageTokenType.Time, + isAbsolute: true, + isRelative: false, + timestamp: alertState.ui.triggeredMS, + } as AlertMessageTimeToken, + { + startToken: '#start_link', + endToken: '#end_link', + type: AlertMessageTokenType.Link, + url: `elasticsearch/nodes/${stat.nodeId}`, + } as AlertMessageLinkToken, + ], }; } @@ -222,7 +157,7 @@ export class CpuUsageRule extends BaseRule { return; } const shortActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.shortAction', { - defaultMessage: 'Verify CPU usage of node.', + defaultMessage: 'Verify CPU level of node.', }); const fullActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.fullAction', { defaultMessage: 'View node', @@ -234,8 +169,28 @@ export class CpuUsageRule extends BaseRule { ccs ); const action = `[${fullActionText}](${globalStateLink})`; - const internalShortMessage = this.getMessage(firingNode, cluster.clusterName, shortActionText); - const internalFullMessage = this.getMessage(firingNode, cluster.clusterName, action); + const internalShortMessage = i18n.translate( + 'xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage', + { + defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {shortActionText}`, + values: { + clusterName: cluster.clusterName, + nodeName: firingNode.nodeName, + shortActionText, + }, + } + ); + const internalFullMessage = i18n.translate( + 'xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage', + { + defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {action}`, + values: { + clusterName: cluster.clusterName, + nodeName: firingNode.nodeName, + action, + }, + } + ); instance.scheduleActions('default', { internalShortMessage, internalFullMessage: Globals.app.isCloud ? internalShortMessage : internalFullMessage, @@ -251,28 +206,4 @@ export class CpuUsageRule extends BaseRule { actionPlain: shortActionText, }); } - - private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) { - const stat = state.meta as AlertCpuUsageNodeStats; - - if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) { - return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', { - defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`, - values: { - clusterName, - nodeName: state.nodeName, - action, - }, - }); - } - - return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessage', { - defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster {clusterName}. 
{action}`, - values: { - clusterName, - nodeName: state.nodeName, - action, - }, - }); - } } diff --git a/x-pack/plugins/monitoring/server/lib/alerts/__snapshots__/fetch_cpu_usage_node_stats.test.ts.snap b/x-pack/plugins/monitoring/server/lib/alerts/__snapshots__/fetch_cpu_usage_node_stats.test.ts.snap deleted file mode 100644 index 9a06dcd7263d2..0000000000000 --- a/x-pack/plugins/monitoring/server/lib/alerts/__snapshots__/fetch_cpu_usage_node_stats.test.ts.snap +++ /dev/null @@ -1,247 +0,0 @@ -// Jest Snapshot v1, https://goo.gl/fbAQLP - -exports[`fetchCpuUsageNodeStats when running in a container calculates the containerized CPU usage 1`] = ` -Object { - "aggs": Object { - "clusters": Object { - "aggs": Object { - "nodes": Object { - "aggs": Object { - "average_cpu_usage_percent": Object { - "avg": Object { - "field": "node_stats.process.cpu.percent", - }, - }, - "index": Object { - "terms": Object { - "field": "_index", - "size": 1, - }, - }, - "max_periods": Object { - "max": Object { - "field": "node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods", - }, - }, - "max_usage_nanos": Object { - "max": Object { - "field": "node_stats.os.cgroup.cpuacct.usage_nanos", - }, - }, - "min_periods": Object { - "min": Object { - "field": "node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods", - }, - }, - "min_usage_nanos": Object { - "min": Object { - "field": "node_stats.os.cgroup.cpuacct.usage_nanos", - }, - }, - "name": Object { - "terms": Object { - "field": "source_node.name", - "size": 1, - }, - }, - "quota_micros_max": Object { - "max": Object { - "field": "node_stats.os.cgroup.cpu.cfs_quota_micros", - }, - }, - "quota_micros_min": Object { - "min": Object { - "field": "node_stats.os.cgroup.cpu.cfs_quota_micros", - }, - }, - }, - "terms": Object { - "field": "node_stats.node_id", - "size": 10, - }, - }, - }, - "terms": Object { - "field": "cluster_uuid", - "size": 10, - }, - }, - }, - "filter_path": Array [ - "aggregations", - ], - "index": ".monitoring-es-*,metrics-elasticsearch.stack_monitoring.node_stats-*", - "query": Object { - "bool": Object { - "filter": Array [ - Object { - "bool": Object { - "minimum_should_match": 1, - "should": Array [ - Object { - "term": Object { - "type": "node_stats", - }, - }, - Object { - "term": Object { - "metricset.name": "node_stats", - }, - }, - Object { - "term": Object { - "data_stream.dataset": "elasticsearch.stack_monitoring.node_stats", - }, - }, - ], - }, - }, - Object { - "terms": Object { - "cluster_uuid": Array [ - "my-test-cluster", - ], - }, - }, - Object { - "range": Object { - "timestamp": Object { - "format": "epoch_millis", - "gte": 0, - "lte": 10, - }, - }, - }, - Object { - "bool": Object { - "minimum_should_match": 1, - "should": Array [ - Object { - "term": Object { - "cluster_uuid": Object { - "value": "my-test-cluster", - }, - }, - }, - ], - }, - }, - ], - }, - }, - "size": 0, -} -`; - -exports[`fetchCpuUsageNodeStats when running outside a container calculates the CPU usage 1`] = ` -Object { - "aggs": Object { - "clusters": Object { - "aggs": Object { - "nodes": Object { - "aggs": Object { - "average_cpu": Object { - "avg": Object { - "field": "node_stats.process.cpu.percent", - }, - }, - "index": Object { - "terms": Object { - "field": "_index", - "size": 1, - }, - }, - "name": Object { - "terms": Object { - "field": "source_node.name", - "size": 1, - }, - }, - "quota_micros_max": Object { - "max": Object { - "field": "node_stats.os.cgroup.cpu.cfs_quota_micros", - }, - }, - "quota_micros_min": Object { - "min": Object { - 
"field": "node_stats.os.cgroup.cpu.cfs_quota_micros", - }, - }, - }, - "terms": Object { - "field": "node_stats.node_id", - "size": 10, - }, - }, - }, - "terms": Object { - "field": "cluster_uuid", - "size": 10, - }, - }, - }, - "filter_path": Array [ - "aggregations", - ], - "index": ".monitoring-es-*,metrics-elasticsearch.stack_monitoring.node_stats-*", - "query": Object { - "bool": Object { - "filter": Array [ - Object { - "bool": Object { - "minimum_should_match": 1, - "should": Array [ - Object { - "term": Object { - "type": "node_stats", - }, - }, - Object { - "term": Object { - "metricset.name": "node_stats", - }, - }, - Object { - "term": Object { - "data_stream.dataset": "elasticsearch.stack_monitoring.node_stats", - }, - }, - ], - }, - }, - Object { - "terms": Object { - "cluster_uuid": Array [ - "my-test-cluster", - ], - }, - }, - Object { - "range": Object { - "timestamp": Object { - "format": "epoch_millis", - "gte": 0, - "lte": 10, - }, - }, - }, - Object { - "bool": Object { - "minimum_should_match": 1, - "should": Array [ - Object { - "term": Object { - "cluster_uuid": Object { - "value": "my-test-cluster", - }, - }, - }, - ], - }, - }, - ], - }, - }, - "size": 0, -} -`; diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts index 214a7c04005f5..77c96e8b6138a 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.test.ts @@ -5,75 +5,64 @@ * 2.0. */ +import type * as estypes from '@elastic/elasticsearch/lib/api/typesWithBodyKey'; import { elasticsearchClientMock } from '@kbn/core-elasticsearch-client-server-mocks'; -import { loggerMock } from '@kbn/logging-mocks'; import { fetchCpuUsageNodeStats } from './fetch_cpu_usage_node_stats'; -describe('fetchCpuUsageNodeStats', () => { - describe('when running outside a container', () => { - const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser; - - const configSlice: any = { - ui: { - ccs: { enabled: false }, - container: { - elasticsearch: { - enabled: false, - }, +jest.mock('../../static_globals', () => ({ + Globals: { + app: { + config: { + ui: { + ccs: { enabled: true }, }, - max_bucket_size: 10, }, - }; + }, + }, +})); - const filterQuery = { - bool: { - should: [ - { - term: { - cluster_uuid: { - value: 'my-test-cluster', - }, - }, - }, - ], - minimum_should_match: 1, - }, - }; +describe('fetchCpuUsageNodeStats', () => { + const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser; + const clusters = [ + { + clusterUuid: 'abc123', + clusterName: 'test', + }, + ]; + const startMs = 0; + const endMs = 0; + const size = 10; - it('calculates the CPU usage', async () => { - esClient.search.mockResponse({ + it('fetch normal stats', async () => { + esClient.search.mockResponse( + // @ts-expect-error not full response interface + { aggregations: { clusters: { buckets: [ { - key: 'my-test-cluster', + key: clusters[0].clusterUuid, nodes: { buckets: [ { - key: 'my-test-node', - average_cpu: { - value: 45, - }, - quota_micros_max: { - value: null, - }, - quota_micros_min: { - value: null, - }, - name: { + key: 'theNodeId', + index: { buckets: [ { - key: 'test-node', + key: '.monitoring-es-TODAY', }, ], }, - index: { + name: { buckets: [ { - key: 'a-local-index', + key: 'theNodeName', }, ], }, + average_cpu: { + value: 10, + }, }, ], }, @@ -81,70 +70,66 @@ 
describe('fetchCpuUsageNodeStats', () => { ], }, }, - } as any); - - const stats = await fetchCpuUsageNodeStats( - { - esClient, - clusterUuids: ['my-test-cluster'], - startMs: 0, - endMs: 10, - filterQuery, - logger: loggerMock.create(), - }, - configSlice - ); - - expect(stats).toEqual([ - { - clusterUuid: 'my-test-cluster', - nodeId: 'my-test-node', - nodeName: 'test-node', - ccs: undefined, - cpuUsage: 45, - unexpectedLimits: false, - }, - ]); - - // If this check fails, it means the query has changed which `might` mean the response shape has changed and - // the test data needs to be updated to reflect the new format. - expect(esClient.search.mock.calls[0][0]).toMatchSnapshot(); - }); + } + ); + const result = await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size); + expect(result).toEqual([ + { + clusterUuid: clusters[0].clusterUuid, + nodeName: 'theNodeName', + nodeId: 'theNodeId', + cpuUsage: 10, + containerUsage: undefined, + containerPeriods: undefined, + containerQuota: undefined, + ccs: null, + }, + ]); + }); - it('warns about container metrics being present', async () => { - esClient.search.mockResponse({ + it('fetch container stats', async () => { + esClient.search.mockResponse( + // @ts-expect-error not full response interface + { aggregations: { clusters: { buckets: [ { - key: 'my-test-cluster', + key: clusters[0].clusterUuid, nodes: { buckets: [ { - key: 'my-test-node', - average_cpu: { - value: 45, - }, - quota_micros_max: { - value: 2000, - }, - quota_micros_min: { - value: 2000, + key: 'theNodeId', + index: { + buckets: [ + { + key: '.monitoring-es-TODAY', + }, + ], }, name: { buckets: [ { - key: 'test-node', + key: 'theNodeName', }, ], }, - index: { + histo: { buckets: [ + null, { - key: 'a-local-index', + usage_deriv: { + normalized_value: 10, + }, + periods_deriv: { + normalized_value: 5, + }, }, ], }, + average_quota: { + value: 50, + }, }, ], }, @@ -152,115 +137,59 @@ describe('fetchCpuUsageNodeStats', () => { ], }, }, - } as any); - - const stats = await fetchCpuUsageNodeStats( - { - esClient, - clusterUuids: ['my-test-cluster'], - startMs: 0, - endMs: 10, - filterQuery, - logger: loggerMock.create(), - }, - configSlice - ); - - expect(stats).toEqual([ - { - unexpectedLimits: true, - clusterUuid: 'my-test-cluster', - nodeId: 'my-test-node', - nodeName: 'test-node', - ccs: undefined, - cpuUsage: 45, - }, - ]); - }); - }); - - describe('when running in a container', () => { - const esClient = elasticsearchClientMock.createScopedClusterClient().asCurrentUser; - - const configSlice: any = { - ui: { - ccs: { enabled: false }, - container: { - elasticsearch: { - enabled: true, - }, - }, - max_bucket_size: 10, - }, - }; - - const filterQuery = { - bool: { - should: [ - { - term: { - cluster_uuid: { - value: 'my-test-cluster', - }, - }, - }, - ], - minimum_should_match: 1, + } + ); + const result = await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size); + expect(result).toEqual([ + { + clusterUuid: clusters[0].clusterUuid, + nodeName: 'theNodeName', + nodeId: 'theNodeId', + cpuUsage: undefined, + containerUsage: 10, + containerPeriods: 5, + containerQuota: 50, + ccs: null, }, - }; - - it('calculates the containerized CPU usage', async () => { - // 45% CPU usage - const maxPeriods = 1000; - const quotaMicros = 100000; - const usageLimitNanos = maxPeriods * quotaMicros * 1000; - const maxUsageNanos = 0.45 * usageLimitNanos; + ]); + }); - esClient.search.mockResponse({ + it('fetch properly return ccs', async () => { + 
esClient.search.mockResponse( + // @ts-expect-error not full response interface + { aggregations: { clusters: { buckets: [ { - key: 'my-test-cluster', + key: clusters[0].clusterUuid, nodes: { buckets: [ { - key: 'my-test-node', - min_usage_nanos: { - value: 0, - }, - max_usage_nanos: { - value: maxUsageNanos, - }, - min_periods: { - value: 0, - }, - max_periods: { - value: maxPeriods, - }, - quota_micros_min: { - value: quotaMicros, - }, - quota_micros_max: { - value: quotaMicros, - }, - average_cpu_usage_percent: { - value: 45, - }, - name: { + key: 'theNodeId', + index: { buckets: [ { - key: 'test-node', + key: 'foo:.monitoring-es-TODAY', }, ], }, - index: { + name: { buckets: [ { - key: 'a-local-index', + key: 'theNodeName', }, ], }, + average_usage: { + value: 10, + }, + average_periods: { + value: 5, + }, + average_quota: { + value: 50, + }, }, ], }, @@ -268,190 +197,90 @@ describe('fetchCpuUsageNodeStats', () => { ], }, }, - } as any); - - const stats = await fetchCpuUsageNodeStats( - { - esClient, - clusterUuids: ['my-test-cluster'], - startMs: 0, - endMs: 10, - filterQuery, - logger: loggerMock.create(), - }, - configSlice - ); - - expect(stats).toEqual([ - { - clusterUuid: 'my-test-cluster', - nodeId: 'my-test-node', - nodeName: 'test-node', - ccs: undefined, - cpuUsage: 45, - }, - ]); + } + ); + const result = await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size); + expect(result[0].ccs).toBe('foo'); + }); - // If this check fails, it means the query has changed which `might` mean the response shape has changed and - // the test data needs to be updated to reflect the new format. - expect(esClient.search.mock.calls[0][0]).toMatchSnapshot(); + it('should use consistent params', async () => { + let params = null; + esClient.search.mockImplementation((...args) => { + params = args[0]; + return Promise.resolve({} as estypes.SearchResponse); }); - - it('warns about resource usage limits not being set', async () => { - esClient.search.mockResponse({ - aggregations: { - clusters: { - buckets: [ + const filterQuery = + '{"bool":{"should":[{"exists":{"field":"cluster_uuid"}}],"minimum_should_match":1}}'; + await fetchCpuUsageNodeStats(esClient, clusters, startMs, endMs, size, filterQuery); + expect(params).toStrictEqual({ + index: + '*:.monitoring-es-*,.monitoring-es-*,*:metrics-elasticsearch.stack_monitoring.node_stats-*,metrics-elasticsearch.stack_monitoring.node_stats-*', + filter_path: ['aggregations'], + body: { + size: 0, + query: { + bool: { + filter: [ + { terms: { cluster_uuid: ['abc123'] } }, { - key: 'my-test-cluster', - nodes: { - buckets: [ + bool: { + should: [ + { term: { type: 'node_stats' } }, + { term: { 'metricset.name': 'node_stats' } }, { - key: 'my-test-node', - min_usage_nanos: { - value: 0, - }, - max_usage_nanos: { - value: 1000, - }, - min_periods: { - value: 0, - }, - max_periods: { - value: 100, - }, - quota_micros_min: { - value: -1, - }, - quota_micros_max: { - value: -1, - }, - average_cpu_usage_percent: { - value: 45, - }, - name: { - buckets: [ - { - key: 'test-node', - }, - ], - }, - index: { - buckets: [ - { - key: 'a-local-index', - }, - ], - }, + term: { 'data_stream.dataset': 'elasticsearch.stack_monitoring.node_stats' }, }, ], + minimum_should_match: 1, }, }, + { range: { timestamp: { format: 'epoch_millis', gte: 0, lte: 0 } } }, + { + bool: { should: [{ exists: { field: 'cluster_uuid' } }], minimum_should_match: 1 }, + }, ], }, }, - } as any); - - const stats = await fetchCpuUsageNodeStats( - { - esClient, - clusterUuids: 
['my-test-cluster'], - startMs: 0, - endMs: 10, - filterQuery, - logger: loggerMock.create(), - }, - configSlice - ); - - expect(stats).toEqual([ - { - clusterUuid: 'my-test-cluster', - nodeId: 'my-test-node', - nodeName: 'test-node', - ccs: undefined, - cpuUsage: 45, - }, - ]); - }); - - it('warns about resource usage limits being changed', async () => { - esClient.search.mockResponse({ - aggregations: { + aggs: { clusters: { - buckets: [ - { - key: 'my-test-cluster', - nodes: { - buckets: [ - { - key: 'my-test-node', - min_usage_nanos: { - value: 0, - }, - max_usage_nanos: { - value: 1000, - }, - min_periods: { - value: 0, - }, - max_periods: { - value: 100, - }, - quota_micros_min: { - value: -1, - }, - quota_micros_max: { - value: 10000, - }, - average_cpu_usage_percent: { - value: 45, - }, - name: { - buckets: [ - { - key: 'test-node', - }, - ], - }, - index: { - buckets: [ - { - key: 'a-local-index', - }, - ], + terms: { field: 'cluster_uuid', size: 10, include: ['abc123'] }, + aggs: { + nodes: { + terms: { field: 'node_stats.node_id', size: 10 }, + aggs: { + index: { terms: { field: '_index', size: 1 } }, + average_cpu: { avg: { field: 'node_stats.process.cpu.percent' } }, + average_quota: { avg: { field: 'node_stats.os.cgroup.cpu.cfs_quota_micros' } }, + name: { terms: { field: 'source_node.name', size: 1 } }, + histo: { + date_histogram: { field: 'timestamp', fixed_interval: '0m' }, + aggs: { + average_periods: { + max: { field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods' }, + }, + average_usage: { max: { field: 'node_stats.os.cgroup.cpuacct.usage_nanos' } }, + usage_deriv: { + derivative: { + buckets_path: 'average_usage', + gap_policy: 'skip', + unit: '1s', + }, + }, + periods_deriv: { + derivative: { + buckets_path: 'average_periods', + gap_policy: 'skip', + unit: '1s', + }, }, }, - ], + }, }, }, - ], + }, }, }, - } as any); - - const stats = await fetchCpuUsageNodeStats( - { - esClient, - clusterUuids: ['my-test-cluster'], - startMs: 0, - endMs: 10, - filterQuery, - logger: loggerMock.create(), - }, - configSlice - ); - - expect(stats).toEqual([ - { - limitsChanged: true, - clusterUuid: 'my-test-cluster', - nodeId: 'my-test-node', - nodeName: 'test-node', - ccs: undefined, - cpuUsage: undefined, - }, - ]); + }, }); }); }); diff --git a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts index 5ccaa522c7368..8037ad94e6764 100644 --- a/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts +++ b/x-pack/plugins/monitoring/server/lib/alerts/fetch_cpu_usage_node_stats.ts @@ -5,303 +5,139 @@ * 2.0. 
*/ -import { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types'; -import { ElasticsearchClient, Logger } from '@kbn/core/server'; -import { InferSearchResponseOf } from '@kbn/es-types'; -import { CCS_REMOTE_PATTERN } from '../../../common/constants'; -import { AlertCpuUsageNodeStats } from '../../../common/types/alerts'; -import { MonitoringConfig } from '../../config'; -import { getElasticsearchDataset, getIndexPatterns } from '../cluster/get_index_patterns'; +import { ElasticsearchClient } from '@kbn/core/server'; +import { get } from 'lodash'; +import moment from 'moment'; +import { NORMALIZED_DERIVATIVE_UNIT } from '../../../common/constants'; +import { AlertCluster, AlertCpuUsageNodeStats } from '../../../common/types/alerts'; import { createDatasetFilter } from './create_dataset_query_filter'; +import { getIndexPatterns, getElasticsearchDataset } from '../cluster/get_index_patterns'; +import { Globals } from '../../static_globals'; +import { CCS_REMOTE_PATTERN } from '../../../common/constants'; + +interface NodeBucketESResponse { + key: string; + average_cpu: { value: number }; +} -interface Options { - esClient: ElasticsearchClient; - clusterUuids: string[]; - startMs: number; - endMs: number; - filterQuery?: QueryDslQueryContainer; - logger: Logger; +interface ClusterBucketESResponse { + key: string; + nodes: { + buckets: NodeBucketESResponse[]; + }; } export async function fetchCpuUsageNodeStats( - options: Options, - config: MonitoringConfig + esClient: ElasticsearchClient, + clusters: AlertCluster[], + startMs: number, + endMs: number, + size: number, + filterQuery?: string ): Promise { - if (config.ui.container.elasticsearch.enabled) { - options.logger.debug('CPU usage rule: Computing usage for containerized clusters'); - return fetchContainerStats(options, config); - } + // Using pure MS didn't seem to work well with the date_histogram interval + // but minutes does + const intervalInMinutes = moment.duration(endMs - startMs).asMinutes(); - options.logger.debug('CPU usage rule: Computing usage for non-containerized clusters'); - return fetchNonContainerStats(options, config); -} - -async function fetchContainerStats( - { esClient, startMs, endMs, clusterUuids, filterQuery }: Options, - config: MonitoringConfig -) { const indexPatterns = getIndexPatterns({ - config, + config: Globals.app.config, moduleType: 'elasticsearch', dataset: 'node_stats', ccs: CCS_REMOTE_PATTERN, }); - const params = { index: indexPatterns, filter_path: ['aggregations'], - size: 0, - query: { - bool: { - filter: [ - createDatasetFilter('node_stats', 'node_stats', getElasticsearchDataset('node_stats')), - { - terms: { - cluster_uuid: clusterUuids, - }, - }, - { - range: { - timestamp: { - format: 'epoch_millis', - gte: startMs, - lte: endMs, + body: { + size: 0, + query: { + bool: { + filter: [ + { + terms: { + cluster_uuid: clusters.map((cluster) => cluster.clusterUuid), }, }, - }, - ], - }, - }, - aggs: { - clusters: { - terms: { - field: 'cluster_uuid', - size: config.ui.max_bucket_size, - }, - aggs: { - nodes: { - terms: { - field: 'node_stats.node_id', - size: config.ui.max_bucket_size, - }, - aggs: { - name: { - terms: { - field: 'source_node.name', - size: 1, - }, - }, - // Used to check for CCS and get the remote cluster name - index: { - terms: { - field: '_index', - size: 1, - }, - }, - // Fallback value in case container limits are not specified - average_cpu_usage_percent: { - avg: { - field: 'node_stats.process.cpu.percent', - }, - }, - // Container limit min and max, to 
calculate usage and detect config changes - quota_micros_max: { - max: { - field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', - }, - }, - quota_micros_min: { - min: { - field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', - }, - }, - // Usage to calculate delta - max_usage_nanos: { - max: { - field: 'node_stats.os.cgroup.cpuacct.usage_nanos', - }, - }, - min_usage_nanos: { - min: { - field: 'node_stats.os.cgroup.cpuacct.usage_nanos', - }, - }, - // Periods to calculate delta - max_periods: { - max: { - field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods', - }, - }, - min_periods: { - min: { - field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods', + createDatasetFilter('node_stats', 'node_stats', getElasticsearchDataset('node_stats')), + { + range: { + timestamp: { + format: 'epoch_millis', + gte: startMs, + lte: endMs, }, }, }, - }, + ], }, }, - }, - }; - - if (filterQuery) { - (params.query!.bool!.filter! as QueryDslQueryContainer[]).push(filterQuery); - } - - const response = (await esClient.search(params)) as unknown as InferSearchResponseOf< - unknown, - typeof params - >; - - if (!response.aggregations) { - throw new Error('Failed to resolve needed aggregations for CPU Usage Rule'); - } - - return response.aggregations.clusters.buckets.flatMap((cluster) => { - return cluster.nodes.buckets.map((node): AlertCpuUsageNodeStats => { - let nodeName; - if (node.name.buckets.length) { - nodeName = node.name.buckets[0].key as string; - } - - let ccs; - if (node.index.buckets.length) { - const index = node.index.buckets[0].key as string; - ccs = index.includes(':') ? index.split(':')[0] : undefined; - } - - const nodeStats = { - clusterUuid: cluster.key as string, - nodeId: node.key as string, - nodeName, - ccs, - }; - - const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1; - - if ( - limitsNotSet || - node.max_usage_nanos.value === null || - node.min_usage_nanos.value === null || - node.max_periods.value === null || - node.min_periods.value === null || - node.quota_micros_max.value === null - ) { - return { - ...nodeStats, - cpuUsage: node.average_cpu_usage_percent.value ?? 
undefined, - }; - } - - if (node.quota_micros_min.value !== node.quota_micros_max.value) { - return { - ...nodeStats, - limitsChanged: true, - cpuUsage: undefined, - }; - } - - const usageDeltaNanos = node.max_usage_nanos.value - node.min_usage_nanos.value; - const periodsDelta = node.max_periods.value - node.min_periods.value; - - const cpuUsage = computeCfsPercentCpuUsage( - usageDeltaNanos, - node.quota_micros_max.value, - periodsDelta - ); - - return { - ...nodeStats, - cpuUsage: Math.round(cpuUsage * 100) / 100, - }; - }); - }); -} - -function computeCfsPercentCpuUsage(usageNanos: number, quotaMicros: number, periods: number) { - // See https://github.com/elastic/kibana/pull/159351 for an explanation of this formula - const quotaNanos = quotaMicros * 1000; - const limitNanos = quotaNanos * periods; - const usageAsFactor = usageNanos / limitNanos; - return usageAsFactor * 100; -} - -async function fetchNonContainerStats( - { esClient, startMs, endMs, clusterUuids, filterQuery }: Options, - config: MonitoringConfig -) { - const indexPatterns = getIndexPatterns({ - config, - moduleType: 'elasticsearch', - dataset: 'node_stats', - ccs: CCS_REMOTE_PATTERN, - }); - - const params = { - index: indexPatterns, - filter_path: ['aggregations'], - size: 0, - query: { - bool: { - filter: [ - createDatasetFilter('node_stats', 'node_stats', getElasticsearchDataset('node_stats')), - { - terms: { - cluster_uuid: clusterUuids, - }, - }, - { - range: { - timestamp: { - format: 'epoch_millis', - gte: startMs, - lte: endMs, - }, - }, + aggs: { + clusters: { + terms: { + field: 'cluster_uuid', + size, + include: clusters.map((cluster) => cluster.clusterUuid), }, - ], - }, - }, - aggs: { - clusters: { - terms: { - field: 'cluster_uuid', - size: config.ui.max_bucket_size, - }, - aggs: { - nodes: { - terms: { - field: 'node_stats.node_id', - size: config.ui.max_bucket_size, - }, - aggs: { - name: { - terms: { - field: 'source_node.name', - size: 1, + aggs: { + nodes: { + terms: { + field: 'node_stats.node_id', + size, + }, + aggs: { + index: { + terms: { + field: '_index', + size: 1, + }, }, - }, - // Used to check for CCS and get the remote cluster name - index: { - terms: { - field: '_index', - size: 1, + average_cpu: { + avg: { + field: 'node_stats.process.cpu.percent', + }, }, - }, - average_cpu: { - avg: { - field: 'node_stats.process.cpu.percent', + average_quota: { + avg: { + field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', + }, }, - }, - // Container limit min and max, to detect possible config errors - quota_micros_max: { - max: { - field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', + name: { + terms: { + field: 'source_node.name', + size: 1, + }, }, - }, - quota_micros_min: { - min: { - field: 'node_stats.os.cgroup.cpu.cfs_quota_micros', + histo: { + date_histogram: { + field: 'timestamp', + fixed_interval: `${intervalInMinutes}m`, + }, + aggs: { + average_periods: { + max: { + field: 'node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods', + }, + }, + average_usage: { + max: { + field: 'node_stats.os.cgroup.cpuacct.usage_nanos', + }, + }, + usage_deriv: { + derivative: { + buckets_path: 'average_usage', + gap_policy: 'skip' as const, + unit: NORMALIZED_DERIVATIVE_UNIT, + }, + }, + periods_deriv: { + derivative: { + buckets_path: 'average_periods', + gap_policy: 'skip' as const, + unit: NORMALIZED_DERIVATIVE_UNIT, + }, + }, + }, }, }, }, @@ -311,44 +147,38 @@ async function fetchNonContainerStats( }, }; - if (filterQuery) { - (params.query!.bool!.filter! 
as QueryDslQueryContainer[]).push(filterQuery); - } - - const response = (await esClient.search(params)) as unknown as InferSearchResponseOf< - unknown, - typeof params - >; - - if (!response.aggregations) { - throw new Error('Failed to resolve needed aggregations for CPU Usage Rule'); + try { + if (filterQuery) { + const filterQueryObject = JSON.parse(filterQuery); + params.body.query.bool.filter.push(filterQueryObject); + } + } catch (e) { + // meh } - return response.aggregations.clusters.buckets.flatMap((cluster) => { - return cluster.nodes.buckets.map((node): AlertCpuUsageNodeStats => { - let nodeName; - if (node.name.buckets.length) { - nodeName = node.name.buckets[0].key as string; - } - - let ccs; - if (node.index.buckets.length) { - const index = node.index.buckets[0].key as string; - ccs = index.includes(':') ? index.split(':')[0] : undefined; - } - - const runningInAContainerWithLimits = - (node.quota_micros_min.value !== null && node.quota_micros_min.value !== -1) || - (node.quota_micros_max.value !== null && node.quota_micros_max.value !== -1); - - return { - clusterUuid: cluster.key as string, - nodeId: node.key as string, - cpuUsage: node.average_cpu.value ?? undefined, - nodeName, - ccs, - unexpectedLimits: runningInAContainerWithLimits, + const response = await esClient.search(params); + const stats: AlertCpuUsageNodeStats[] = []; + const clusterBuckets = get( + response, + 'aggregations.clusters.buckets', + [] + ) as ClusterBucketESResponse[]; + for (const clusterBucket of clusterBuckets) { + for (const node of clusterBucket.nodes.buckets) { + const lastBucket = get(node, 'histo.buckets[1]', {}); + const indexName = get(node, 'index.buckets[0].key', ''); + const stat = { + clusterUuid: clusterBucket.key, + nodeId: node.key, + nodeName: get(node, 'name.buckets[0].key'), + cpuUsage: get(node, 'average_cpu.value'), + containerUsage: get(lastBucket, 'usage_deriv.normalized_value'), + containerPeriods: get(lastBucket, 'periods_deriv.normalized_value'), + containerQuota: get(node, 'average_quota.value'), + ccs: indexName.includes(':') ? indexName.split(':')[0] : null, }; - }); - }); + stats.push(stat); + } + } + return stats; } diff --git a/x-pack/plugins/monitoring/tsconfig.json b/x-pack/plugins/monitoring/tsconfig.json index d70d8b51fcd08..00ca962568141 100644 --- a/x-pack/plugins/monitoring/tsconfig.json +++ b/x-pack/plugins/monitoring/tsconfig.json @@ -41,7 +41,6 @@ "@kbn/shared-ux-router", "@kbn/observability-shared-plugin", "@kbn/shared-ux-link-redirect-app", - "@kbn/es-types", "@kbn/logs-shared-plugin", ], "exclude": [ diff --git a/x-pack/plugins/translations/translations/fr-FR.json b/x-pack/plugins/translations/translations/fr-FR.json index 488aa279b4c05..1f53d831fae6c 100644 --- a/x-pack/plugins/translations/translations/fr-FR.json +++ b/x-pack/plugins/translations/translations/fr-FR.json @@ -27082,12 +27082,9 @@ "xpack.monitoring.alerts.clusterHealth.firing.internalShortMessage": "L'alerte d'intégrité de cluster se déclenche pour {clusterName}. L'intégrité actuelle est {health}. {actionText}", "xpack.monitoring.alerts.clusterHealth.ui.firingMessage": "L'intégrité du cluster Elasticsearch est {health}.", "xpack.monitoring.alerts.clusterHealth.ui.nextSteps.message1": "{message}. #start_linkView now#end_link", - "xpack.monitoring.alerts.cpuUsage.firing.internalMessage": "L'alerte d'utilisation CPU se déclenche pour le nœud {nodeName} dans les cluster {clusterName}. 
{action}", - "xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure": "L'alerte d'utilisation CPU pour le nœud {nodeName} dans le cluster {clusterName} a rencontré des problèmes lors de l'évaluation de l'utilisation. {action}", - "xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage": "Le calcul de l'utilisation du CPU pour le nœud #start_link{nodeName}#end_link a échoué. Pour en savoir plus, veuillez consulter les logs Kibana. Dernière vérification : #absolute", - "xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "Le nœud #start_link{nodeName}#end_link signale une utilisation du CPU de {cpuUsage} %, ce qui est supérieur au seuil configuré de {threshold} %. Dernière vérification : #absolute", - "xpack.monitoring.alerts.cpuUsage.ui.limitsChanged": "Les limites de ressources pour le nœud #start_link{nodeName}#end_link ont changé dans la fenêtre de visualisation. Impossible de calculer avec assurance l'utilisation du CPU pour les alertes. Veuillez monitorer l'utilisation jusqu'à ce que la fenêtre soit déplacée. Dernière vérification : #absolute", - "xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits": "Kibana est configuré pour les charges de travail non conteneurisées mais le nœud #start_link{nodeName}#end_link dispose de limites de ressources configurées. Le nœud signale une utilisation de {cpuUsage} %. Dernière vérification : #absolute", + "xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage": "L'alerte d'utilisation CPU se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {action}", + "xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage": "L'alerte d'utilisation CPU se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {shortActionText}", + "xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "Le nœud #start_link{nodeName}#end_link signale une utilisation CPU de {cpuUsage} % à #absolute", "xpack.monitoring.alerts.diskUsage.firing.internalFullMessage": "L'alerte d'utilisation du disque se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {action}", "xpack.monitoring.alerts.diskUsage.firing.internalShortMessage": "L'alerte d'utilisation du disque se déclenche pour le nœud {nodeName} dans le cluster : {clusterName}. {shortActionText}", "xpack.monitoring.alerts.diskUsage.ui.firingMessage": "Le nœud #start_link{nodeName}#end_link signale une utilisation du disque de {diskUsage} % à #absolute", diff --git a/x-pack/plugins/translations/translations/ja-JP.json b/x-pack/plugins/translations/translations/ja-JP.json index c09f29875cb57..5d0a766d14300 100644 --- a/x-pack/plugins/translations/translations/ja-JP.json +++ b/x-pack/plugins/translations/translations/ja-JP.json @@ -27082,12 +27082,9 @@ "xpack.monitoring.alerts.clusterHealth.firing.internalShortMessage": "クラスター正常性アラートが{clusterName}に対して作動しています。現在のヘルスは{health}です。{actionText}", "xpack.monitoring.alerts.clusterHealth.ui.firingMessage": "Elasticsearchクラスターの正常性は{health}です。", "xpack.monitoring.alerts.clusterHealth.ui.nextSteps.message1": "{message}. 
#start_linkView now#end_link", - "xpack.monitoring.alerts.cpuUsage.firing.internalMessage": "クラスター{clusterName}のノード{nodeName}について、CPU使用率のアラートが発生しています。{action}", - "xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure": "クラスター{clusterName}のノード{nodeName}のCPU使用率アラートでは、使用率の評価中に問題が発生しました。{action}", - "xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage": "ノード#start_link{nodeName}#end_linkのCPU使用率の計算に失敗しました。詳細については、Kibanaログを確認してください。最終確認 #absolute", - "xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "ノード#start_link{nodeName}#end_linkのCPU使用率が{cpuUsage}%で、設定されたしきい値{threshold}%を超えています。最終確認 #absolute", - "xpack.monitoring.alerts.cpuUsage.ui.limitsChanged": "ノード#start_link{nodeName}#end_linkのリソース制限がルックバックウィンドウ内で変更されたため、アラート用のCPU使用率を正確に計算できません。ウィンドウが移動するまで、使用状況を監視してください。最終確認 #absolute", - "xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits": "Kibanaはコンテナー化されていないワークロード用に構成されていますが、ノード#start_link{nodeName}#end_linkにはリソース制限が設定されています。ノードは使用率{cpuUsage}%を報告しています。最終確認 #absolute", + "xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage": "クラスター{clusterName}のノード{nodeName}について、CPU使用率のアラートが発生しています。{action}", + "xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage": "クラスター{clusterName}のノード{nodeName}について、CPU使用率のアラートが発生しています。{shortActionText}", + "xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "ノード#start_link{nodeName}#end_linkは、#absoluteでCPU使用率{cpuUsage}%を報告しています", "xpack.monitoring.alerts.diskUsage.firing.internalFullMessage": "クラスター{clusterName}のノード{nodeName}について、ディスク使用率のアラートが発生しています。{action}", "xpack.monitoring.alerts.diskUsage.firing.internalShortMessage": "クラスター{clusterName}のノード{nodeName}について、ディスク使用率のアラートが発生しています。{shortActionText}", "xpack.monitoring.alerts.diskUsage.ui.firingMessage": "ノード#start_link{nodeName}#end_linkは、#absoluteでディスク使用率{diskUsage}%を報告しています", diff --git a/x-pack/plugins/translations/translations/zh-CN.json b/x-pack/plugins/translations/translations/zh-CN.json index 6b809caf229bd..084532e4917d1 100644 --- a/x-pack/plugins/translations/translations/zh-CN.json +++ b/x-pack/plugins/translations/translations/zh-CN.json @@ -27080,12 +27080,9 @@ "xpack.monitoring.alerts.clusterHealth.firing.internalShortMessage": "为 {clusterName} 触发了集群运行状况告警。当前运行状况为 {health}。{actionText}", "xpack.monitoring.alerts.clusterHealth.ui.firingMessage": "Elasticsearch 集群运行状况为 {health}。", "xpack.monitoring.alerts.clusterHealth.ui.nextSteps.message1": "{message}. 
#start_linkView now#end_link", - "xpack.monitoring.alerts.cpuUsage.firing.internalMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了 CPU 使用率告警。{action}", - "xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure": "评估使用率时,集群 {clusterName} 中节点 {nodeName} 的 CPU 使用率告警出现问题。{action}", - "xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage": "无法计算节点 #start_link{nodeName}#end_link 的 CPU 使用率。请检查 Kibana 日志了解更多详情。上次检查时间为 #absolute", - "xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "节点 #start_link{nodeName}#end_link 报告 CPU 使用率为 {cpuUsage}%,这超出了配置的阈值 {threshold}%。上次检查时间为 #absolute", - "xpack.monitoring.alerts.cpuUsage.ui.limitsChanged": "节点 #start_link{nodeName}#end_link 的资源限制已在回溯时间窗口内更改,无法放心用于计算 CPU 使用率以进行告警。请监测使用率,直到时间窗口已过去。上次检查时间为 #absolute", - "xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits": "已为非容器化工作负载配置 Kibana,但节点 #start_link{nodeName}#end_link 具有配置的资源限制。节点报告使用率为 {cpuUsage}%。上次检查时间为 #absolute", + "xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了 CPU 使用率告警。{action}", + "xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了 CPU 使用率告警。{shortActionText}", + "xpack.monitoring.alerts.cpuUsage.ui.firingMessage": "节点 #start_link{nodeName}#end_link 于 #absolute报告 cpu 使用率为 {cpuUsage}%", "xpack.monitoring.alerts.diskUsage.firing.internalFullMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了磁盘使用率告警。{action}", "xpack.monitoring.alerts.diskUsage.firing.internalShortMessage": "集群 {clusterName} 中的节点 {nodeName} 触发了磁盘使用率告警。{shortActionText}", "xpack.monitoring.alerts.diskUsage.ui.firingMessage": "节点 #start_link{nodeName}#end_link 于 #absolute 报告磁盘使用率为 {diskUsage}%",