Skip to content

Commit

Permalink
Add a higher threshold alarm which we can use for known checkout failure reasons (#6749)

Browse files Browse the repository at this point in the history

This PR adds a new higher threshold alarm/metric. In #6745 we stopped alarming
on the "email already taken" error from identity. It's a known issue and we
don't want to spend time investigating every time it happens. However, we would
like to know if it starts occurring at a higher volume. This PR adds the
concept of a higher threshold alarm/metric which we can use to cover this case.
The alarm will trigger if 10 or more minutes in the last hour have errors.
  • Loading branch information
tjmw authored Jan 31, 2025
1 parent e4e46c3 commit dbf6953
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 6 deletions.
60 changes: 60 additions & 0 deletions cdk/lib/__snapshots__/frontend.test.ts.snap

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions cdk/lib/frontend.ts
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,33 @@ export class Frontend extends GuStack {
treatMissingData: TreatMissingData.NOT_BREACHING,
snsTopicName: `alarms-handler-topic-${this.stage}`,
});

// Higher-threshold alarm for create failures with *known* causes (e.g.
// "email address already taken"). Unlike the standard failure alarm above,
// a single error does not page: with a 1-minute period, threshold 1,
// evaluationPeriods 60 and datapointsToAlarm 10, it fires only when 10 or
// more one-minute buckets in the last hour each contain at least one error.
new GuAlarm(this, "ServerSideHighThresholdCreateFailureAlarm", {
app,
alarmName: alarmName(
"support-frontend create recurring product call failed multiple times for a known reason"
),
alarmDescription: alarmDescription(
"Someone pressed buy on a recurring product but received an error. This has happened multiple times for a known reason."
),
actionsEnabled: shouldCreateAlarms,
threshold: 1,
evaluationPeriods: 60,
datapointsToAlarm: 10,
comparisonOperator:
ComparisonOperator.GREATER_THAN_OR_EQUAL_TO_THRESHOLD,
metric: new Metric({
// Metric name/namespace/dimensions must match what the Scala side emits
// in AwsCloudWatchMetricSetup.serverSideHighThresholdCreateFailure.
metricName: "ServerSideHighThresholdCreateFailure",
namespace: "support-frontend",
dimensionsMap: {
Stage: this.stage,
},
statistic: "Sum",
period: Duration.minutes(1),
}),
treatMissingData: TreatMissingData.NOT_BREACHING,
snsTopicName: `alarms-handler-topic-${this.stage}`,
});
}
}
}
29 changes: 23 additions & 6 deletions support-frontend/app/actions/CustomActionBuilders.scala
Original file line number Diff line number Diff line change
Expand Up @@ -62,28 +62,45 @@ class CustomActionBuilders(

case class LoggingAndAlarmOnFailure[A](chainedAction: Action[A]) extends EssentialAction with SafeLogging {

/** Sends a single datapoint for the given metric request via the shared CloudWatch client. */
private def pushMetric(cloudwatchEvent: AwsCloudWatchMetricPut.MetricRequest) =
  AwsCloudWatchMetricPut(AwsCloudWatchMetricPut.client)(cloudwatchEvent)
/**
 * Pushes the standard server-side create-failure metric (backs the regular,
 * fire-on-first-error alarm).
 *
 * Fix: the previous text contained both the old direct
 * `AwsCloudWatchMetricPut(...)` call and the new `pushMetric(...)` call,
 * which would emit the datapoint twice; exactly one push is intended.
 */
private def pushAlarmMetric = {
  val cloudwatchEvent = AwsCloudWatchMetricSetup.serverSideCreateFailure(stage)
  pushMetric(cloudwatchEvent)
}

/** Pushes the metric behind the higher-threshold alarm (known failure reasons). */
private def pushHighThresholdAlarmMetric =
  pushMetric(AwsCloudWatchMetricSetup.serverSideHighThresholdCreateFailure(stage))

/**
 * Decides, for a 500 response, which alarm metric (if any) to push based on
 * the response's reason phrase:
 *   - in `ignoreList`: log only, never alarm;
 *   - in `highThresholdList`: push the higher-threshold metric (alarms only
 *     when errors occur in 10+ minutes of the last hour);
 *   - otherwise: log an error and push the standard alarm metric.
 *
 * Fix: the previous text interleaved the pre-change and post-change branches —
 * the old negated condition `if (!ignoreList.contains(...))` sat alongside the
 * new `if (ignoreList.contains(...))` chain, and a duplicated trailing
 * `else { logger.info(... ignore list ...) }` block was left dangling, making
 * the method syntactically invalid. This is the coherent post-change logic.
 */
private def maybePushAlarmMetric(result: Result) = {
  // We'll never alarm on these
  val ignoreList = Set(
    emailProviderRejectedCode,
    invalidEmailAddressCode,
    recaptchaFailedCode,
  )
  // We'll alarm on these, but only over a certain threshold
  val highThresholdList = Set(
    emailAddressAlreadyTakenCode,
  )
  if (result.header.status == 500) {
    // Read the reason phrase once rather than re-evaluating it per branch.
    val reason = result.header.reasonPhrase.getOrElse("")
    if (ignoreList.contains(reason)) {
      logger.info(
        s"not pushing alarm metric for ${result.header.status} ${result.header.reasonPhrase} as it is in our ignore list",
      )
    } else if (highThresholdList.contains(reason)) {
      logger.info(
        s"pushing higher threshold alarm metric for ${result.header.status} ${result.header.reasonPhrase}",
      )
      pushHighThresholdAlarmMetric
    } else {
      logger.error(
        scrub"pushing alarm metric - non 2xx response. Http code: ${result.header.status}, reason: ${result.header.reasonPhrase}",
      )
      pushAlarmMetric
    }
  }
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ object AwsCloudWatchMetricSetup {
),
)

/**
 * Metric request backing the higher-threshold create-failure alarm,
 * dimensioned by deployment stage. Name and dimensions must match the
 * CDK alarm definition in cdk/lib/frontend.ts.
 */
def serverSideHighThresholdCreateFailure(stage: Stage): MetricRequest = {
  val dimensions = Map(
    MetricDimensionName("Stage") -> MetricDimensionValue(stage.toString),
  )
  getMetricRequest(MetricName("ServerSideHighThresholdCreateFailure"), dimensions)
}

def defaultPromotionsLoadingFailure(stage: Stage): MetricRequest =
getMetricRequest(
MetricName("DefaultPromotionsLoadingFailure"),
Expand Down

0 comments on commit dbf6953

Please sign in to comment.