Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Stack Monitoring] Update flows for cpu stats fetching #167244

Merged
merged 16 commits into from
Sep 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion x-pack/plugins/monitoring/common/types/alerts.ts
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,6 @@ export interface AlertNodeStats {
/**
 * Per-node stats consumed by the CPU usage rule: the shared
 * {@link AlertNodeStats} fields plus the CPU reading and flags describing
 * problems found while evaluating cgroup CPU limits.
 */
export interface AlertCpuUsageNodeStats extends AlertNodeStats {
/** CPU usage as a percentage; left undefined when no usage value could be produced. */
cpuUsage?: number;
/** Set when the minimum and maximum observed cgroup CPU quota differ within the evaluated range. */
limitsChanged?: boolean;
/** Set when container stats were requested but the node's cgroup CPU quota was unset (-1) or absent (null). */
missingLimits?: boolean;
/** Set by the non-container stats path when the node nevertheless reports cgroup CPU quota values. */
unexpectedLimits?: boolean;
}

Expand Down
40 changes: 30 additions & 10 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -258,12 +258,7 @@ describe('CpuUsageRule', () => {

it('should fire actions when resource limits are missing', async () => {
(fetchCpuUsageNodeStats as jest.Mock).mockImplementation(() => {
return [
{
...stat,
missingLimits: true,
},
];
return [stat];
});

const rule = new CpuUsageRule();
Expand All @@ -287,14 +282,39 @@ describe('CpuUsageRule', () => {
nodeId,
nodeName,
threshold,
missingLimits: true,
},
nodeId,
nodeName,
ui: {
isFiring: true,
message: {
text: `Kibana is configured for containerized workloads but node #start_linkmyNodeName#end_link does not have resource limits configured. Fallback metric reports usage of ${cpuUsage}%. Last checked at #absolute`,
text: `Node #start_link${nodeName}#end_link is reporting CPU usage of ${cpuUsage}% which is above the configured threshold of ${threshold}%. Last checked at #absolute`,
nextSteps: [
{
text: '#start_linkCheck hot threads#end_link',
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'docLink',
partialUrl:
'{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/cluster-nodes-hot-threads.html',
},
],
},
{
text: '#start_linkCheck long running tasks#end_link',
tokens: [
{
startToken: '#start_link',
endToken: '#end_link',
type: 'docLink',
partialUrl:
'{elasticWebsiteUrl}guide/en/elasticsearch/reference/{docLinkVersion}/tasks.html',
},
],
},
],
tokens: [
{
startToken: '#start_link',
Expand All @@ -319,8 +339,8 @@ describe('CpuUsageRule', () => {
],
});
expect(scheduleActions).toHaveBeenCalledWith('default', {
internalFullMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert for node ${nodeName} in cluster ${clusterName} faced issues while evaluating the usage. Verify CPU usage of node.`,
internalFullMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. [View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
internalShortMessage: `CPU usage alert is firing for node ${nodeName} in cluster ${clusterName}. Verify CPU usage of node.`,
action: `[View node](http://localhost:5601/app/monitoring#/elasticsearch/nodes/${nodeId}?_g=(cluster_uuid:${clusterUuid}))`,
actionPlain: 'Verify CPU usage of node.',
clusterName,
Expand Down
27 changes: 2 additions & 25 deletions x-pack/plugins/monitoring/server/alerts/cpu_usage_rule.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,7 @@ export class CpuUsageRule extends BaseRule {
stat: AlertCpuUsageNodeStats,
threshold: number
): { shouldFire: boolean; severity: AlertSeverity } {
if (
stat.missingLimits ||
stat.limitsChanged ||
stat.unexpectedLimits ||
stat.cpuUsage === undefined
) {
if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
let severity = AlertSeverity.Warning;
if (stat.cpuUsage && stat.cpuUsage > threshold) {
severity = AlertSeverity.Danger;
Expand Down Expand Up @@ -149,19 +144,6 @@ export class CpuUsageRule extends BaseRule {
} as AlertMessageTimeToken,
];

if (stat.missingLimits) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.missingLimits', {
defaultMessage: `Kibana is configured for containerized workloads but node #start_link{nodeName}#end_link does not have resource limits configured. Fallback metric reports usage of {cpuUsage}%. Last checked at #absolute`,
values: {
nodeName: stat.nodeName,
cpuUsage: numeral(stat.cpuUsage).format(ROUNDED_FLOAT),
},
}),
tokens,
};
}

if (stat.unexpectedLimits) {
return {
text: i18n.translate('xpack.monitoring.alerts.cpuUsage.ui.unexpectedLimits', {
Expand Down Expand Up @@ -273,12 +255,7 @@ export class CpuUsageRule extends BaseRule {
private getMessage(state: AlertCpuUsageState, clusterName: string, action: string) {
const stat = state.meta as AlertCpuUsageNodeStats;

if (
stat.missingLimits ||
stat.limitsChanged ||
stat.unexpectedLimits ||
stat.cpuUsage === undefined
) {
if (stat.limitsChanged || stat.unexpectedLimits || stat.cpuUsage === undefined) {
return i18n.translate('xpack.monitoring.alerts.cpuUsage.firing.internalMessageForFailure', {
defaultMessage: `CPU usage alert for node {nodeName} in cluster {clusterName} faced issues while evaluating the usage. {action}`,
values: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,10 +126,10 @@ describe('fetchCpuUsageNodeStats', () => {
value: 45,
},
quota_micros_max: {
value: -1,
value: 2000,
},
quota_micros_min: {
value: -1,
value: 2000,
},
name: {
buckets: [
Expand Down Expand Up @@ -366,7 +366,6 @@ describe('fetchCpuUsageNodeStats', () => {

expect(stats).toEqual([
{
missingLimits: true,
clusterUuid: 'my-test-cluster',
nodeId: 'my-test-node',
nodeName: 'test-node',
Expand Down Expand Up @@ -454,83 +453,5 @@ describe('fetchCpuUsageNodeStats', () => {
},
]);
});

it('warns about failing to compute usage due to values missing', async () => {
esClient.search.mockResponse({
aggregations: {
clusters: {
buckets: [
{
key: 'my-test-cluster',
nodes: {
buckets: [
{
key: 'my-test-node',
min_usage_nanos: {
value: null,
},
max_usage_nanos: {
value: null,
},
min_periods: {
value: null,
},
max_periods: {
value: null,
},
quota_micros_min: {
value: 10000,
},
quota_micros_max: {
value: 10000,
},
average_cpu_usage_percent: {
value: 45,
},
name: {
buckets: [
{
key: 'test-node',
},
],
},
index: {
buckets: [
{
key: 'a-local-index',
},
],
},
},
],
},
},
],
},
},
} as any);

const stats = await fetchCpuUsageNodeStats(
{
esClient,
clusterUuids: ['my-test-cluster'],
startMs: 0,
endMs: 10,
filterQuery,
logger: loggerMock.create(),
},
configSlice
);

expect(stats).toEqual([
{
clusterUuid: 'my-test-cluster',
nodeId: 'my-test-node',
nodeName: 'test-node',
ccs: undefined,
cpuUsage: undefined,
},
]);
});
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,6 @@ import { MonitoringConfig } from '../../config';
import { getElasticsearchDataset, getIndexPatterns } from '../cluster/get_index_patterns';
import { createDatasetFilter } from './create_dataset_query_filter';

/**
 * Aggregated values that the container CPU usage calculation depends on,
 * keyed by a human-readable description of the aggregation. A `null` value
 * marks an aggregation that yielded nothing; `findEmptyValues` uses the keys
 * verbatim to name the empty ones in the warning log, so the key text is
 * user-visible.
 */
interface CpuUsageFieldsWithValues {
'max of node_stats.os.cgroup.cpu.cfs_quota_micros': number | null;
'max of node_stats.os.cgroup.cpuacct.usage_nanos': number | null;
'min of node_stats.os.cgroup.cpuacct.usage_nanos': number | null;
'max of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': number | null;
'min of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods': number | null;
}

interface Options {
esClient: ElasticsearchClient;
clusterUuids: string[];
Expand All @@ -45,7 +37,7 @@ export async function fetchCpuUsageNodeStats(
}

async function fetchContainerStats(
{ esClient, startMs, endMs, clusterUuids, filterQuery, logger }: Options,
{ esClient, startMs, endMs, clusterUuids, filterQuery }: Options,
config: MonitoringConfig
) {
const indexPatterns = getIndexPatterns({
Expand Down Expand Up @@ -178,58 +170,34 @@ async function fetchContainerStats(
ccs = index.includes(':') ? index.split(':')[0] : undefined;
}

const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1;
const notRunningInAContainer =
node.quota_micros_min.value === null && node.quota_micros_max.value === null;
if (limitsNotSet || notRunningInAContainer) {
return {
missingLimits: true,
clusterUuid: cluster.key as string,
nodeId: node.key as string,
cpuUsage: node.average_cpu_usage_percent.value ?? undefined,
nodeName,
ccs,
};
}
const nodeStats = {
clusterUuid: cluster.key as string,
nodeId: node.key as string,
nodeName,
ccs,
};

if (node.quota_micros_min.value !== node.quota_micros_max.value) {
return {
limitsChanged: true,
clusterUuid: cluster.key as string,
nodeId: node.key as string,
cpuUsage: undefined,
nodeName,
ccs,
};
}
const limitsNotSet = node.quota_micros_max.value === -1 && node.quota_micros_min.value === -1;

if (
limitsNotSet ||
node.max_usage_nanos.value === null ||
node.min_usage_nanos.value === null ||
node.max_periods.value === null ||
node.min_periods.value === null ||
node.quota_micros_max.value === null
) {
logger.warn(
`CPU usage rule: Some aggregated values needed for container CPU usage calculation was empty: ${findEmptyValues(
{
'max of node_stats.os.cgroup.cpu.cfs_quota_micros': node.quota_micros_max.value,
'max of node_stats.os.cgroup.cpuacct.usage_nanos': node.max_usage_nanos.value,
'min of node_stats.os.cgroup.cpuacct.usage_nanos': node.min_usage_nanos.value,
'max of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods':
node.max_periods.value,
'min of node_stats.os.cgroup.cpu.stat.number_of_elapsed_periods':
node.min_periods.value,
}
)}`
);
return {
...nodeStats,
cpuUsage: node.average_cpu_usage_percent.value ?? undefined,
};
}

if (node.quota_micros_min.value !== node.quota_micros_max.value) {
return {
clusterUuid: cluster.key as string,
nodeId: node.key as string,
...nodeStats,
limitsChanged: true,
cpuUsage: undefined,
nodeName,
ccs,
};
}

Expand All @@ -243,24 +211,13 @@ async function fetchContainerStats(
);

return {
clusterUuid: cluster.key as string,
nodeId: node.key as string,
...nodeStats,
cpuUsage: Math.round(cpuUsage * 100) / 100,
nodeName,
ccs,
};
});
});
}

/**
 * Lists the labels of every field whose value is exactly `null`.
 *
 * @param fieldsWithValues - Field labels mapped to their aggregated values.
 * @returns The labels of the `null`-valued fields, in the object's own
 *   enumeration order, joined with `', '`; an empty string when none are `null`.
 */
function findEmptyValues(fieldsWithValues: CpuUsageFieldsWithValues): string {
  const labelValuePairs: Array<[string, number | null]> = Object.entries(fieldsWithValues);
  const emptyLabels: string[] = [];

  for (const [label, aggregatedValue] of labelValuePairs) {
    // Only a strict `null` counts as empty; 0 and other numbers are real values.
    if (aggregatedValue === null) {
      emptyLabels.push(label);
    }
  }

  return emptyLabels.join(', ');
}

function computeCfsPercentCpuUsage(usageNanos: number, quotaMicros: number, periods: number) {
// See https://github.com/elastic/kibana/pull/159351 for an explanation of this formula
const quotaNanos = quotaMicros * 1000;
Expand Down Expand Up @@ -380,16 +337,17 @@ async function fetchNonContainerStats(
ccs = index.includes(':') ? index.split(':')[0] : undefined;
}

const runningInAContainer =
node.quota_micros_min.value !== null || node.quota_micros_max.value !== null;
const runningInAContainerWithLimits =
miltonhultgren marked this conversation as resolved.
Show resolved Hide resolved
(node.quota_micros_min.value !== null && node.quota_micros_min.value !== -1) ||
(node.quota_micros_max.value !== null && node.quota_micros_max.value !== -1);

return {
clusterUuid: cluster.key as string,
nodeId: node.key as string,
cpuUsage: node.average_cpu.value ?? undefined,
nodeName,
ccs,
unexpectedLimits: runningInAContainer,
unexpectedLimits: runningInAContainerWithLimits,
};
});
});
Expand Down
Loading