Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Stack Monitoring] Update flows for cpu stats fetching #167244

Merged
merged 16 commits into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion x-pack/plugins/monitoring/common/types/alerts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,6 @@ export interface AlertNodeStats {
export interface AlertCpuUsageNodeStats extends AlertNodeStats {
cpuUsage?: number;
limitsChanged?: boolean;
missingLimits?: boolean;
unexpectedLimits?: boolean;
}

Expand Down
40 changes: 30 additions & 10 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -258,12 +258,7 @@ describe('CpuUsageRule', () => {

it('should fire actions when resource limits are missing', async () => {
(fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => {
return [
{
...stat,
missingLimits: true,
},
];
return [stat];
});

const rule = new CpuUsageRule();
Expand All @@ -287,14 +282,39 @@ describe('CpuUsageRule', () => {
nodeId,
nodeName,
threshold,
missingLimits: true,
},
nodeId,
nodeName,
ui: {
isFiring: true,
message: {
text: `Kibana is configured for containerized workloads but node #start_linkmyNodeName#end_link does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%. Last checked at #absolute`,
text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`,
nextSteps: [
{
text: '#start_linkCheck hot threads#end_link',
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'docLink',
partialUrl:
'{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/cluster-nodes-hot-threads.html',
},
],
},
{
text: '#start_linkCheck long running tasks#end_link',
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'docLink',
partialUrl:
'{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html',
},
],
},
],
tokens: [
{
startToken: '#start_link',
Expand All @@ -319,8 +339,8 @@ describe('CpuUsageRule', () => {
],
});
expect(scheduleActions).toHaveBeenCalledWith('default', {
internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`,
internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`,
action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
actionPlain: 'Verify CPU usage of node.',
clusterName,
Expand Down
27 changes: 2 additions & 25 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,7 @@ export class CpuUsageRule extends BaseRule {
stat: AlertCpuUsageNodeStats,
threshold: number
): { shouldFire: boolean; severity: AlertSeverity } {
if (
stat.missingLimits ||
stat.limitsChanged ||
stat.unexpectedLimits ||
stat.cpuUsage === undefined
) {
if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
let severity = AlertSeverity.Warning;
if (stat.cpuUsage && stat.cpuUsage > threshold) {
severity = AlertSeverity.Danger;
Expand Down Expand Up @@ -149,19 +144,6 @@ export class CpuUsageRule extends BaseRule {
} as AlertMessageTimeToken,
];

if (stat.missingLimits) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.missingLimits', {
defaultMessage: `Kibana is configured for containerized workloads but node #start_link{nodeName}#end_link does not have resource limits configured. Fallback metric reports usage of {cpuUsage}%. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT),
},
}),
tokens,
};
}

if (stat.unexpectedLimits) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', {
Expand Down Expand Up @@ -273,12 +255,7 @@ export class CpuUsageRule extends BaseRule {
private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) {
const stat = state.meta as AlertCpuUsageNodeStats;

if (
stat.missingLimits ||
stat.limitsChanged ||
stat.unexpectedLimits ||
stat.cpuUsage === undefined
) {
if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', {
defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`,
values: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,10 @@ describe('fetchCpuUsageNodeStats', () => {
value: 45,
},
quota_micros_max: {
value: -1,
value: 2000,
},
quota_micros_min: {
value: -1,
value: 2000,
},
name: {
buckets: [
Expand Down Expand Up @@ -366,7 +366,6 @@ describe('fetchCpuUsageNodeStats', () => {

expect(stats).toEqual([
{
missingLimits: true,
clusterUuid: 'my-test-cluster',
nodeId: 'my-test-node',
nodeName: 'test-node',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,14 +179,17 @@ async function fetchContainerStats(
}

const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1;
const notRunningInAContainer =
node.quota_micros_min.value === null && node.quota_micros_max.value === null;
if (limitsNotSet || notRunningInAContainer) {
if (limitsNotSet) {
miltonhultgren marked this conversation as resolved.
Show resolved Hide resolved
const cpuUsage = node.average_cpu_usage_percent.value ?? undefined;

logger.warn(
`CPU usage rule: Node "${node.key}" does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%.`
);

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm having real doubts about if we should log something or not.

On the one hand, this looks to me as a missed configuration but maybe it's more common than I think?
And then, having this logged every time the rule executes, possibly for many nodes, feels like a risk for spam which will make the Kibana log grow which means more cost of storing that log for no real gain and the customer might be fine with it.

So maybe we should make it a debug level instead? But, if we do, is it really useful? Will it help customers notice something is wrong.
I don't know if users really look at the Kibana logs anyway, similar for the log we make below about values missing?

Maybe both of those cases should just rely on the fallback metric silently?

I just feel like in the last PR, I included more changes than needed for the bug fix.
I think the limits change part is good because that's a real cause for bugs in the calculation.

But the other two cases should probably just silently return the base CPU metric since I'm not sure if either really helps the user find issues?

So my vote is to remove the logging here, and combine this check with the check below about if they are null, and not report any special status about that. What do you think?
That would bring the total set of changes closer to fixing only the original problem and removes a bunch of noise from the logs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1;
// This check was mostly for the compiler to be happy about the types anyway
const statsMissing =  node.max_usage_nanos.value === null &&
        node.min_usage_nanos.value === null &&
        node.max_periods.value === null &&
        node.min_periods.value === null &&
        node.quota_micros_max.value === null;

if (limitsNotSet || statsMissing) {
  const cpuUsage = node.average_cpu_usage_percent.value ?? undefined;
  return {
  clusterUuid: cluster.key as string,
  nodeId: node.key as string,
  cpuUsage,
  nodeName,
  ccs,
  };
}

Then after we check if the values are different or not.

return {
missingLimits: true,
clusterUuid: cluster.key as string,
nodeId: node.key as string,
cpuUsage: node.average_cpu_usage_percent.value ?? undefined,
cpuUsage,
nodeName,
ccs,
};
Expand Down Expand Up @@ -380,16 +383,17 @@ async function fetchNonContainerStats(
ccs = index.includes(':') ? index.split(':')[0] : undefined;
}

const runningInAContainer =
node.quota_micros_min.value !== null || node.quota_micros_max.value !== null;
const runningInAContainerWithLimits =
miltonhultgren marked this conversation as resolved.
Show resolved Hide resolved
(node.quota_micros_min.value !== null && node.quota_micros_min.value !== -1) ||
(node.quota_micros_max.value !== null && node.quota_micros_max.value !== -1);

return {
clusterUuid: cluster.key as string,
nodeId: node.key as string,
cpuUsage: node.average_cpu.value ?? undefined,
nodeName,
ccs,
unexpectedLimits: runningInAContainer,
unexpectedLimits: runningInAContainerWithLimits,
};
});
});
Expand Down