Skip to content

Commit

Permalink
chore: Update Error Alarms (#783)
Browse files Browse the repository at this point in the history
* Update monitoring.ts

* Update monitoring.test.ts.snap

* Update monitoring.ts

* Update monitoring.test.ts.snap

* Update monitoring.ts

* Update monitoring.test.ts.snap

* Update monitoring.ts

* Update monitoring.ts

* Update monitoring.test.ts.snap

* Updated dataPoints to alarm

* Update monitoring.ts

* Update monitoring.test.ts.snap

* Update monitoring.ts

* Update monitoring.test.ts.snap

* Using canary

* Update metric

* Update threshold

* Updating alarm

* Update monitoring.ts

* Update monitoring.test.ts.snap

* Update monitoring.ts

* Update monitoring.test.ts.snap

* Update monitoring.ts

* Update monitoring.test.ts.snap

* Only create alarm action in PROD

* Removed codeDurationInDays variable
  • Loading branch information
akinsola-guardian authored Jan 15, 2024
1 parent 100b0a6 commit 56627c9
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 10 deletions.
53 changes: 50 additions & 3 deletions cdk/lib/__snapshots__/monitoring.test.ts.snap
Original file line number Diff line number Diff line change
Expand Up @@ -232,8 +232,16 @@ exports[`The Monitoring stack matches the snapshot 1`] = `
},
"cmpmonitoringalarms68497862": {
"Properties": {
"AlarmDescription": "Alarm if the SUM of Errors is greater than or equal to the threshold (1) for 1 evaluation period",
"ActionsEnabled": true,
"AlarmActions": [
{
"Ref": "internalEmailRecipientA4594AC9",
},
],
"AlarmDescription": "This alarm is triggered if 4 out of 5 lambda executions fail in eu-west-1",
"AlarmName": "CMP Monitoring - PROD - eu-west-1",
"ComparisonOperator": "GreaterThanOrEqualToThreshold",
"DatapointsToAlarm": 4,
"Dimensions": [
{
"Name": "FunctionName",
Expand All @@ -242,12 +250,18 @@ exports[`The Monitoring stack matches the snapshot 1`] = `
},
},
],
"EvaluationPeriods": 1,
"EvaluationPeriods": 5,
"MetricName": "Errors",
"Namespace": "AWS/Lambda",
"Period": 60,
"OKActions": [
{
"Ref": "internalEmailRecipientA4594AC9",
},
],
"Period": 120,
"Statistic": "Sum",
"Threshold": 1,
"TreatMissingData": "notBreaching",
},
"Type": "AWS::CloudWatch::Alarm",
},
Expand Down Expand Up @@ -289,6 +303,39 @@ exports[`The Monitoring stack matches the snapshot 1`] = `
},
"Type": "AWS::Events::Rule",
},
"internalEmailRecipientA4594AC9": {
"Properties": {
"Tags": [
{
"Key": "gu:cdk:version",
"Value": "TEST",
},
{
"Key": "gu:repo",
"Value": "guardian/consent-management-platform",
},
{
"Key": "Stack",
"Value": "cmp-monitoring",
},
{
"Key": "Stage",
"Value": "PROD",
},
],
},
"Type": "AWS::SNS::Topic",
},
"internalEmailRecipienttransparencyandconsentguardiancoukAC07DD37": {
"Properties": {
"Endpoint": "[email protected]",
"Protocol": "email",
"TopicArn": {
"Ref": "internalEmailRecipientA4594AC9",
},
},
"Type": "AWS::SNS::Subscription",
},
},
}
`;
43 changes: 36 additions & 7 deletions cdk/lib/monitoring.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,22 @@ import { GuStack } from '@guardian/cdk/lib/constructs/core';
import { GuLambdaFunction } from '@guardian/cdk/lib/constructs/lambda';
import type { App } from 'aws-cdk-lib';
import { Duration } from 'aws-cdk-lib';
import type {
IAlarmAction} from 'aws-cdk-lib/aws-cloudwatch';
import {
Alarm,
ComparisonOperator,
Metric,
Unit,
TreatMissingData,
Unit
} from 'aws-cdk-lib/aws-cloudwatch';
import { SnsAction } from 'aws-cdk-lib/aws-cloudwatch-actions';
import { Rule, RuleTargetInput, Schedule } from 'aws-cdk-lib/aws-events';
import { LambdaFunction } from 'aws-cdk-lib/aws-events-targets';
import { Effect, PolicyStatement } from 'aws-cdk-lib/aws-iam';
import { Runtime } from 'aws-cdk-lib/aws-lambda';
import { Topic } from 'aws-cdk-lib/aws-sns';
import { EmailSubscription } from 'aws-cdk-lib/aws-sns-subscriptions';

export class Monitoring extends GuStack {
constructor(scope: App, id: string, props: GuStackProps) {
Expand All @@ -24,6 +30,8 @@ export class Monitoring extends GuStack {

const lambdaBaseName = 'cmp-monitoring';

const prodDurationInMinutes = 2;

const policyStatement = new PolicyStatement({
effect: Effect.ALLOW,
actions: ['cloudwatch:PutMetricData'],
Expand Down Expand Up @@ -55,12 +63,12 @@ export class Monitoring extends GuStack {

// Defining metric for lambda errors over each metric period
const errorMetric = monitoringLambdaFunction.metricErrors({
period: Duration.minutes(1),
period: Duration.minutes(prodDurationInMinutes),
});

// Defining metric for lambda invocations over each metric period
monitoringLambdaFunction.metricInvocations({
period: Duration.minutes(1),
period: Duration.minutes(prodDurationInMinutes),
});

const lambdaEventTarget = new LambdaFunction(monitoringLambdaFunction, {
Expand All @@ -70,23 +78,44 @@ export class Monitoring extends GuStack {
}),
});


const monitoringDuration: Duration =
stage === 'PROD' ? Duration.minutes(2) : Duration.days(1); // Every day for CODE; Every 2 minutes for PROD.
stage === 'PROD' ? Duration.minutes(prodDurationInMinutes) : Duration.days(1); // Every day for CODE; Every 2 minutes for PROD.

new Rule(this, 'cmp monitoring schedule', {
schedule: Schedule.rate(monitoringDuration),
targets: [lambdaEventTarget],
});


// Error Alarm
new Alarm(this, 'cmp-monitoring-alarms', {
const alarm = new Alarm(this, 'cmp-monitoring-alarms', {
comparisonOperator:
ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
threshold: 1,
evaluationPeriods: 1,
evaluationPeriods: 5, // This value is the number of periods to watch. Here, we're evaluating 5 executions of the lambda. In PROD the lambda is triggered every 2 minutes, so it's checked over a 10-minute timeframe.
actionsEnabled: true,
datapointsToAlarm: 4, // This value is the number of failed data points/executions that will trigger the alarm, i.e. 4 out of 5.
treatMissingData: TreatMissingData.NOT_BREACHING,
metric: errorMetric,
alarmName: `CMP Monitoring - ${stage} - ${region}`,
alarmDescription:
'Alarm if the SUM of Errors is greater than or equal to the threshold (1) for 1 evaluation period',
`This alarm is triggered if 4 out of 5 lambda executions fail in ${region}`,
});

if(this.stage === "PROD"){
const emailSubscription = new EmailSubscription(
"[email protected]"
);

const internalEmailMessaging = new Topic(this, "internalEmailRecipient");
internalEmailMessaging.addSubscription(emailSubscription);

const alarmAction: IAlarmAction = new SnsAction(internalEmailMessaging);

alarm.addAlarmAction(alarmAction)
alarm.addOkAction(alarmAction)
}

}
}

1 comment on commit 56627c9

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage report

St.
Category Percentage Covered / Total
🟢 Statements 92.45% 245/265
🟢 Branches 83.19% 94/113
🟢 Functions 90% 63/70
🟢 Lines 92.19% 236/256

Test suite run success

328 tests passing in 16 suites.

Report generated by 🧪jest coverage report action from 56627c9

Please sign in to comment.