From 0d4e4b1e475959bce458af06572cfce4187d7797 Mon Sep 17 00:00:00 2001 From: Ram Ramrakhya Date: Tue, 19 Oct 2021 19:06:20 -0400 Subject: [PATCH] Enable submission worker metrics and fix node exporter metrics (#3618) --- apps/challenges/aws_utils.py | 2 ++ apps/challenges/task_definitions.py | 25 +++++++++++++++++++++++- docker/prod/nodejs/nginx_production.conf | 4 ++++ monitoring/prometheus/rules.yml | 2 +- 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/apps/challenges/aws_utils.py b/apps/challenges/aws_utils.py index 0a8b2b03bb..20ef297fc8 100644 --- a/apps/challenges/aws_utils.py +++ b/apps/challenges/aws_utils.py @@ -84,6 +84,8 @@ "RDS_PORT": settings.DATABASES["default"]["PORT"], "SECRET_KEY": settings.SECRET_KEY, "SENTRY_URL": os.environ.get("SENTRY_URL"), + "STATSD_ENDPOINT": os.environ.get("STATSD_ENDPOINT"), + "STATSD_PORT": os.environ.get("STATSD_PORT"), } VPC_DICT = { diff --git a/apps/challenges/task_definitions.py b/apps/challenges/task_definitions.py index a737513594..f64ed4b77f 100644 --- a/apps/challenges/task_definitions.py +++ b/apps/challenges/task_definitions.py @@ -112,6 +112,14 @@ {{ "name": "AWS_SES_REGION_ENDPOINT", "value": "{AWS_SES_REGION_ENDPOINT}" + }}, + {{ + "name": "STATSD_ENDPOINT", + "value": "{STATSD_ENDPOINT}" + }}, + {{ + "name": "STATSD_PORT", + "value": "{STATSD_PORT}" }} ], "workingDirectory": "/code", @@ -346,6 +354,14 @@ {{ "name": "AWS_SES_REGION_ENDPOINT", "value": "{AWS_SES_REGION_ENDPOINT}" + }}, + {{ + "name": "STATSD_ENDPOINT", + "value": "{STATSD_ENDPOINT}" + }}, + {{ + "name": "STATSD_PORT", + "value": "{STATSD_PORT}" }} ], "workingDirectory": "/code", @@ -414,10 +430,17 @@ "name": "EVALAI_DNS", "value": "{EVALAI_DNS}" }}, - {{ "name": "EFS_ID", "value": "{EFS_ID}" + }}, + {{ + "name": "STATSD_ENDPOINT", + "value": "{STATSD_ENDPOINT}" + }}, + {{ + "name": "STATSD_PORT", + "value": "{STATSD_PORT}" }} ], diff --git a/docker/prod/nodejs/nginx_production.conf b/docker/prod/nodejs/nginx_production.conf index 
63120e2cc4..c360452ef9 100644 --- a/docker/prod/nodejs/nginx_production.conf +++ b/docker/prod/nodejs/nginx_production.conf @@ -2,6 +2,10 @@ upstream django_app { server django:8000 fail_timeout=0; } +upstream node_exporter { + server node_exporter:9100 fail_timeout=0; +} + server { server_name evalapi.cloudcv.org evalai.cloudcv.org; listen 80; diff --git a/monitoring/prometheus/rules.yml b/monitoring/prometheus/rules.yml index 5fc4f39825..b22cf36210 100644 --- a/monitoring/prometheus/rules.yml +++ b/monitoring/prometheus/rules.yml @@ -14,7 +14,7 @@ groups: - name: Instance-Status rules: - alert: InstanceDown - expr: up == 0 + expr: up{job="node_exporter"} == 0 for: 5m annotations: title: "Instance(s) Down"