Skip to content

Commit

Permalink
[8.12] [monitoring] Revert CPU Usage rule changes (#172913) (#172959)
Browse files Browse the repository at this point in the history
# Backport

This will backport the following commits from `main` to `8.12`:
- [[monitoring] Revert CPU Usage rule changes
(#172913)](https://github.com/elastic/kibana/pull/172913)

<!--- Backport version: 8.9.7 -->

### Questions ?
Please refer to the [Backport tool
documentation](https://github.com/sqren/backport)

<!--BACKPORT [{"author":{"name":"Milton
Hultgren","email":"[email protected]"},"sourceCommit":{"committedDate":"2023-12-08T15:25:23Z","message":"[monitoring]
Revert CPU Usage rule changes (#172913)\n\nReverts
https://github.com/elastic/kibana/pull/159351\r\nReverts
https://github.com/elastic/kibana/pull/167244\r\n\r\nDue to the many
unexpected issues that these changes introduced we've\r\ndecided to
revert these changes until we have better solutions for the\r\nproblems
we've learnt about.\r\n\r\nProblems:\r\n- Gaps in data cause alerts to
fire (see next point)\r\n- Normal CPU rescaling causes alerts to
fire\r\nhttps://github.com/elastic/kibana/issues/160905\r\n- Any error
fires an alert (since there is no other way to inform the\r\nuser about
the problems faced by the rule executor)\r\n- Many assumptions about
cgroups only being for container users are\r\nwrong\r\n\r\nTo address
some of these issues we also need more functionality in the\r\nalerting
framework to be able to register secondary actions so that we\r\nmay
trigger non-oncall workflows for when a rule faces issues
with\r\nevaluating the stats.\r\n\r\nOriginal issue
https://github.com/elastic/kibana/issues/116128","sha":"55bc6d505977e8831633cc76e0f46b2ca66ef559","branchLabelMapping":{"^v8.13.0$":"main","^v(\\d+).(\\d+).\\d+$":"$1.$2"}},"sourcePullRequest":{"labels":["release_note:fix","backport:prev-minor","v8.12.0","v8.13.0"],"number":172913,"url":"https://github.com/elastic/kibana/pull/172913","mergeCommit":{"message":"[monitoring]
Revert CPU Usage rule changes (#172913)\n\nReverts
https://github.com/elastic/kibana/pull/159351\r\nReverts
https://github.com/elastic/kibana/pull/167244\r\n\r\nDue to the many
unexpected issues that these changes introduced we've\r\ndecided to
revert these changes until we have better solutions for the\r\nproblems
we've learnt about.\r\n\r\nProblems:\r\n- Gaps in data cause alerts to
fire (see next point)\r\n- Normal CPU rescaling causes alerts to
fire\r\nhttps://github.com/elastic/kibana/issues/160905\r\n- Any error
fires an alert (since there is no other way to inform the\r\nuser about
the problems faced by the rule executor)\r\n- Many assumptions about
cgroups only being for container users are\r\nwrong\r\n\r\nTo address
some of these issues we also need more functionality in the\r\nalerting
framework to be able to register secondary actions so that we\r\nmay
trigger non-oncall workflows for when a rule faces issues
with\r\nevaluating the stats.\r\n\r\nOriginal issue
https://github.com/elastic/kibana/issues/116128","sha":"55bc6d505977e8831633cc76e0f46b2ca66ef559"}},"sourceBranch":"main","suggestedTargetBranches":["8.12"],"targetPullRequestStates":[{"branch":"8.12","label":"v8.12.0","labelRegex":"^v(\\d+).(\\d+).\\d+$","isSourceBranch":false,"state":"NOT_CREATED"},{"branch":"main","label":"v8.13.0","labelRegex":"^v8.13.0$","isSourceBranch":true,"state":"MERGED","url":"https://github.com/elastic/kibana/pull/172913","number":172913,"mergeCommit":{"message":"[monitoring]
Revert CPU Usage rule changes (#172913)\n\nReverts
https://github.com/elastic/kibana/pull/159351\r\nReverts
https://github.com/elastic/kibana/pull/167244\r\n\r\nDue to the many
unexpected issues that these changes introduced we've\r\ndecided to
revert these changes until we have better solutions for the\r\nproblems
we've learnt about.\r\n\r\nProblems:\r\n- Gaps in data cause alerts to
fire (see next point)\r\n- Normal CPU rescaling causes alerts to
fire\r\nhttps://github.com/elastic/kibana/issues/160905\r\n- Any error
fires an alert (since there is no other way to inform the\r\nuser about
the problems faced by the rule executor)\r\n- Many assumptions about
cgroups only being for container users are\r\nwrong\r\n\r\nTo address
some of these issues we also need more functionality in the\r\nalerting
framework to be able to register secondary actions so that we\r\nmay
trigger non-oncall workflows for when a rule faces issues
with\r\nevaluating the stats.\r\n\r\nOriginal issue
https://github.com/elastic/kibana/issues/116128","sha":"55bc6d505977e8831633cc76e0f46b2ca66ef559"}}]}]
BACKPORT-->

Co-authored-by: Milton Hultgren <[email protected]>
  • Loading branch information
kibanamachine and miltonhultgren authored Dec 8, 2023
1 parent 8506d96 commit b79c4b3
Show file tree
Hide file tree
Showing 10 changed files with 398 additions and 1,385 deletions.
7 changes: 4 additions & 3 deletions x-pack/plugins/monitoring/common/types/alerts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,10 @@ export interface AlertNodeStats {
}

export interface AlertCpuUsageNodeStats extends AlertNodeStats {
cpuUsage?: number;
limitsChanged?: boolean;
unexpectedLimits?: boolean;
cpuUsage: number;
containerUsage: number;
containerPeriods: number;
containerQuota: number;
}

export interface AlertThreadPoolRejectionsStats {
Expand Down
351 changes: 15 additions & 336 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts

Large diffs are not rendered by default.

187 changes: 59 additions & 128 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import { ElasticsearchClient } from '@kbn/core/server';
import { Alert } from '@kbn/alerting-plugin/server';
import { RawAlertInstance, SanitizedRule } from '@kbn/alerting-plugin/common';
import { parseDuration } from '@kbn/alerting-plugin/common/parse_duration';
import { QueryDslQueryContainer } from '@elastic/elasticsearch/lib/api/types';
import { BaseRule } from './base_rule';
import {
AlertData,
Expand Down Expand Up @@ -47,7 +46,7 @@ export class CpuUsageRule extends BaseRule {
{
name: 'node',
description: i18n.translate('xpack.monitoring.alerts.cpuUsage.actionVariables.node', {
defaultMessage: 'The node reporting high CPU usage.',
defaultMessage: 'The node reporting high cpu usage.',
}),
},
...Object.values(AlertingDefaults.ALERT_TYPE.context),
Expand All @@ -63,52 +62,28 @@ export class CpuUsageRule extends BaseRule {
const duration = parseDuration(params.duration);
const endMs = +new Date();
const startMs = endMs - duration;

let filterQuery;
if (params.filterQuery) {
try {
filterQuery = JSON.parse(params.filterQuery) as QueryDslQueryContainer;
} catch (error) {
throw new Error(`Failed to parse filter query in CPU usage rule ${error}`);
}
}

const stats = await fetchCpuUsageNodeStats(
{
esClient,
clusterUuids: clusters.map((cluster) => cluster.clusterUuid),
startMs,
endMs,
filterQuery,
logger: this.scopedLogger,
},
Globals.app.config
esClient,
clusters,
startMs,
endMs,
Globals.app.config.ui.max_bucket_size,
params.filterQuery
);

return stats.map((stat) => ({
clusterUuid: stat.clusterUuid,
...this.outcomeAndSeverity(stat, params.threshold!),
meta: {
...stat,
threshold: params.threshold!,
},
ccs: stat.ccs,
}));
}

private outcomeAndSeverity(
stat: AlertCpuUsageNodeStats,
threshold: number
): { shouldFire: boolean; severity: AlertSeverity } {
if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
let severity = AlertSeverity.Warning;
if (stat.cpuUsage && stat.cpuUsage > threshold) {
severity = AlertSeverity.Danger;
return stats.map((stat) => {
if (Globals.app.config.ui.container.elasticsearch.enabled) {
stat.cpuUsage =
(stat.containerUsage / (stat.containerPeriods * stat.containerQuota * 1000)) * 100;
}
return { shouldFire: true, severity };
}

return { shouldFire: stat.cpuUsage > threshold, severity: AlertSeverity.Danger };
return {
clusterUuid: stat.clusterUuid,
shouldFire: stat.cpuUsage > params.threshold!,
severity: AlertSeverity.Danger,
meta: stat,
ccs: stat.ccs,
};
});
}

protected filterAlertInstance(alertInstance: RawAlertInstance, filters: CommonAlertFilter[]) {
Expand All @@ -127,67 +102,13 @@ export class CpuUsageRule extends BaseRule {
}

protected getUiMessage(alertState: AlertState, item: AlertData): AlertMessage {
const stat = item.meta as AlertCpuUsageNodeStats & Pick<CommonAlertParams, 'threshold'>;
const tokens = [
{
startToken: '#start_link',
endToken: '#end_link',
type: AlertMessageTokenType.Link,
url: `elasticsearch/nodes/${stat.nodeId}`,
} as AlertMessageLinkToken,
{
startToken: '#absolute',
type: AlertMessageTokenType.Time,
isAbsolute: true,
isRelative: false,
timestamp: alertState.ui.triggeredMS,
} as AlertMessageTimeToken,
];

if (stat.unexpectedLimits) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', {
defaultMessage: `Kibana is configured for non-containerized workloads but node #start_link{nodeName}#end_link has resource limits configured. Node reports usage of {cpuUsage}%. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT),
},
}),
tokens,
};
}

if (stat.limitsChanged) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.limitsChanged', {
defaultMessage: `Resource limits for node #start_link{nodeName}#end_link has changed within the look back window, unable to confidently calculate CPU usage for alerting. Please monitor the usage until the window has moved. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
},
}),
tokens,
};
}

if (stat.cpuUsage === undefined) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.failedToComputeUsage', {
defaultMessage: `Failed to compute CPU usage for node #start_link{nodeName}#end_link. Please check the Kibana logs for more details. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
},
}),
tokens,
};
}

const stat = item.meta as AlertCpuUsageNodeStats;
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.firingMessage', {
defaultMessage: `Node #start_link{nodeName}#end_link is reporting CPU usage of {cpuUsage}% which is above the configured threshold of {threshold}%. Last checked at #absolute`,
defaultMessage: `Node #start_link{nodeName}#end_link is reporting cpu usage of {cpuUsage}% at #absolute`,
values: {
nodeName: stat.nodeName,
cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT),
threshold: stat.threshold,
},
}),
nextSteps: [
Expand All @@ -204,7 +125,21 @@ export class CpuUsageRule extends BaseRule {
`{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html`
),
],
tokens,
tokens: [
{
startToken: '#absolute',
type: AlertMessageTokenType.Time,
isAbsolute: true,
isRelative: false,
timestamp: alertState.ui.triggeredMS,
} as AlertMessageTimeToken,
{
startToken: '#start_link',
endToken: '#end_link',
type: AlertMessageTokenType.Link,
url: `elasticsearch/nodes/${stat.nodeId}`,
} as AlertMessageLinkToken,
],
};
}

Expand All @@ -222,7 +157,7 @@ export class CpuUsageRule extends BaseRule {
return;
}
const shortActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.shortAction', {
defaultMessage: 'Verify CPU usage of node.',
defaultMessage: 'Verify CPU level of node.',
});
const fullActionText = i18n.translate('xpack.monitoring.alerts.cpuUsage.fullAction', {
defaultMessage: 'View node',
Expand All @@ -234,8 +169,28 @@ export class CpuUsageRule extends BaseRule {
ccs
);
const action = `[${fullActionText}](${globalStateLink})`;
const internalShortMessage = this.getMessage(firingNode, cluster.clusterName, shortActionText);
const internalFullMessage = this.getMessage(firingNode, cluster.clusterName, action);
const internalShortMessage = i18n.translate(
'xpack.monitoring.alerts.cpuUsage.firing.internalShortMessage',
{
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {shortActionText}`,
values: {
clusterName: cluster.clusterName,
nodeName: firingNode.nodeName,
shortActionText,
},
}
);
const internalFullMessage = i18n.translate(
'xpack.monitoring.alerts.cpuUsage.firing.internalFullMessage',
{
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster: {clusterName}. {action}`,
values: {
clusterName: cluster.clusterName,
nodeName: firingNode.nodeName,
action,
},
}
);
instance.scheduleActions('default', {
internalShortMessage,
internalFullMessage: Globals.app.isCloud ? internalShortMessage : internalFullMessage,
Expand All @@ -251,28 +206,4 @@ export class CpuUsageRule extends BaseRule {
actionPlain: shortActionText,
});
}

private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) {
const stat = state.meta as AlertCpuUsageNodeStats;

if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', {
defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`,
values: {
clusterName,
nodeName: state.nodeName,
action,
},
});
}

return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessage', {
defaultMessage: `CPU usage alert is firing for node {nodeName} in cluster {clusterName}. {action}`,
values: {
clusterName,
nodeName: state.nodeName,
action,
},
});
}
}
Loading

0 comments on commit b79c4b3

Please sign in to comment.