Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Monitoring] Add monitoring setup production environment #3602

Merged
merged 3 commits into from
Oct 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions docker-compose-production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,80 @@ services:
context: ./
dockerfile: docker/prod/code-upload-worker/Dockerfile

# Prometheus server, published on host port 9090.
prometheus:
  image: "prom/prometheus:latest"
  hostname: "prometheus"
  # Run the container process as UID 1000.
  user: "1000"
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    # The /prometheus path prefix matches the location proxied by nginx-ingress.
    - "--web.external-url=http://localhost:9090/prometheus"
  ports:
    - "9090:9090"
  volumes:
    # Production scrape configuration and the rules file it references.
    - "./monitoring/prometheus/prometheus_production.yml:/etc/prometheus/prometheus.yml"
    - "./monitoring/prometheus/rules.yml:/etc/rules/rules.yml"
    # The same host directory is bind-mounted at three container paths.
    - "./monitoring/prometheus/prometheus_db:/var/lib/prometheus"
    - "./monitoring/prometheus/prometheus_db:/prometheus"
    - "./monitoring/prometheus/prometheus_db:/etc/prometheus"

# Grafana, published on host port 3000 and started after prometheus.
grafana:
  image: "grafana/grafana:latest"
  hostname: "grafana"
  # Run the container process as UID 1000.
  user: "1000"
  depends_on:
    - "prometheus"
  # Environment variables come from the production env file.
  env_file:
    - "docker/prod/docker_production.env"
  ports:
    - "3000:3000"
  volumes:
    # Host directory bind-mounted at /var/lib/grafana.
    - "./monitoring/grafana/grafana_db:/var/lib/grafana"

# StatsD exporter; its container hostname is "statsd".
statsd-exporter:
  image: "prom/statsd-exporter:latest"
  hostname: "statsd"
  command:
    - "--log.level=info"
    # Metrics are exposed under /statsd/metrics, the path the Prometheus
    # "statsd" job scrapes.
    - "--web.telemetry-path=/statsd/metrics"
  ports:
    # 9125 is published for both UDP and TCP.
    - "9125:9125/udp"
    - "9125:9125/tcp"
    # 9102 is the port the nginx "statsd_exporter" upstream points at.
    - "9102:9102"

# Prometheus node exporter, published on host port 9100.
# The service key is kebab-case so that it matches the sibling services
# (statsd-exporter, alert-manager, nginx-ingress) and the service name that
# scripts/deployment/deploy.sh passes to `docker-compose pull` / `up`
# ("node-exporter"). The container hostname is intentionally left as
# "node_exporter" so anything resolving that name keeps working.
node-exporter:
  hostname: node_exporter
  image: prom/node-exporter
  ports:
    - '9100:9100'

# Reverse proxy in front of the monitoring services; publishes ports 80 and 443.
nginx-ingress:
  # Image lives in the account's ECR registry and is tagged with the commit id.
  image: "${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/evalai-production-nginx-ingress:${COMMIT_ID}"
  build:
    context: "./"
    dockerfile: "docker/prod/nginx-ingress/Dockerfile"
    args:
      # Build argument passed to the Dockerfile.
      MONITORING_ENV: "production"
  # Start the proxied services before the proxy itself.
  depends_on:
    - "prometheus"
    - "grafana"
    - "statsd-exporter"
    - "alert-manager"
  ports:
    - "80:80"
    - "443:443"

# Alertmanager, published on host port 9093; its container hostname is
# "alert_manager".
alert-manager:
  image: "prom/alertmanager"
  hostname: "alert_manager"
  # Run the container process as UID 1000.
  user: "1000"
  command:
    # The config file is read from the ./monitoring/prometheus bind mount.
    - "--config.file=/prometheus/alert_manager.yml"
    - "--storage.path=/data"
    # The /alert_manager path prefix matches the location proxied by
    # nginx-ingress.
    - "--web.external-url=http://localhost:9093/alert_manager"
  ports:
    - "9093:9093"
  volumes:
    - "./monitoring/prometheus:/prometheus"
    - "./monitoring/alertmanager/data:/data"
    - "./monitoring/alertmanager/templates:/etc/alertmanager/templates"

68 changes: 68 additions & 0 deletions docker/prod/nginx-ingress/nginx_production.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Upstream groups, one per monitoring service. Each points at the container
# hostname and port declared for that service in docker-compose-production.yml.

# Prometheus server.
upstream prometheus {
    server prometheus:9090 fail_timeout=0;
}

# Grafana.
upstream grafana {
    server grafana:3000 fail_timeout=0;
}

# StatsD exporter (container hostname "statsd").
upstream statsd_exporter {
    server statsd:9102 fail_timeout=0;
}

# Alertmanager (container hostname "alert_manager").
upstream alert_manager {
    server alert_manager:9093 fail_timeout=0;
}

# Plain-HTTP virtual host: permanently redirects every request to the HTTPS
# site, preserving the original request URI.
server {
    listen 80;
    server_name monitoring.eval.ai;

    return 301 https://monitoring.eval.ai$request_uri;
}

# HTTPS virtual host for monitoring.eval.ai. Serves static files at "/" and
# reverse-proxies each monitoring service under its own path prefix.
server {
    server_name monitoring.eval.ai;
    # TLS is switched on by the "ssl" parameter of "listen". The standalone
    # "ssl on;" directive is obsolete (removed in nginx 1.25.1, where it makes
    # the configuration fail to load), so it is deliberately not used here.
    listen 443 ssl;

    location / {
        root /usr/share/nginx/html;
        index index.html index.htm;
    }

    ssl_certificate /etc/ssl/eval_ai.crt;
    ssl_certificate_key /etc/ssl/eval_ai.key;
    ssl_prefer_server_ciphers on;
    # Offer only TLS 1.2 and 1.3. SSLv2/SSLv3 and TLS 1.0/1.1 are weak and
    # deprecated, so they stay disabled.
    ssl_protocols TLSv1.2 TLSv1.3;

    access_log /var/log/nginx/access.log;
    error_log /var/log/nginx/error.log;

    # Each proxied location forwards the original Host header, the client
    # address chain and the request scheme to the upstream.

    location /prometheus {
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_pass http://prometheus;
    }

    location /grafana {
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_pass http://grafana;
    }

    location /statsd {
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_pass http://statsd_exporter;
    }

    location /alert_manager {
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header Host $host;
        proxy_set_header X-Forwarded-Proto $scheme;
        proxy_pass http://alert_manager;
    }
}
30 changes: 30 additions & 0 deletions monitoring/prometheus/prometheus_production.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Defaults applied to all scrape jobs in this file.
global:
  scrape_interval: "30s"
  # Labels attached when this server communicates with external systems.
  external_labels:
    monitor: "evalai-monitor"

# Rules file; this is the container path where docker-compose mounts
# ./monitoring/prometheus/rules.yml.
rule_files:
  - "/etc/rules/rules.yml"

# Scrape jobs. Each job overrides the default /metrics path because the
# targets expose their metrics under a path prefix.
scrape_configs:
  # Prometheus scraping itself; the /prometheus prefix comes from the
  # --web.external-url flag set in docker-compose-production.yml.
  - job_name: 'prometheus'
    metrics_path: '/prometheus/metrics'
    static_configs:
      - targets: ['localhost:9090']

  # StatsD exporter, reached through the nginx ingress. The port-80 server for
  # monitoring.eval.ai only answers with a 301 to HTTPS, so scrape over HTTPS
  # directly (same scheme the alerting section uses for this host) instead of
  # depending on redirect-following for every scrape.
  - job_name: 'statsd'
    metrics_path: '/statsd/metrics'
    scheme: 'https'
    static_configs:
      - targets: ['monitoring.eval.ai']

  # NOTE(review): the scheme served by eval.ai for this path is not visible
  # from this change, so the default (http) is left untouched - confirm.
  - job_name: 'node_exporter'
    metrics_path: '/node_exporter'
    static_configs:
      - targets: ['eval.ai']

# Where alerts are sent: Alertmanager behind the nginx ingress, reached over
# HTTPS under the /alert_manager path prefix.
alerting:
  alertmanagers:
    - scheme: "https"
      path_prefix: "/alert_manager"
      static_configs:
        - targets:
            - "monitoring.eval.ai"
1 change: 1 addition & 0 deletions requirements/common.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ pickleshare==0.7.5
Pillow==7.1.0
psycopg2==2.8.4
pycurl==7.43.0.6
PyJWT==2.1.0
PyYaml==5.1
proc==1.0
rstr==2.2.6
Expand Down
4 changes: 2 additions & 2 deletions scripts/deployment/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ case $opt in
eval $(aws ecr get-login --no-include-email)
aws s3 cp s3://cloudcv-secrets/evalai/${env}/docker_${env}.env ./docker/prod/docker_${env}.env
docker-compose -f docker-compose-${env}.yml rm -s -v -f
docker-compose -f docker-compose-${env}.yml pull django nodejs celery
docker-compose -f docker-compose-${env}.yml up -d --force-recreate --remove-orphans django nodejs nodejs_v2 celery
docker-compose -f docker-compose-${env}.yml pull django nodejs celery nodejs_v2 node-exporter
docker-compose -f docker-compose-${env}.yml up -d --force-recreate --remove-orphans django nodejs nodejs_v2 celery node-exporter
ENDSSH2
ssh ubuntu@${MONITORING_INSTANCE} -o StrictHostKeyChecking=no AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID} COMMIT_ID=${COMMIT_ID} env=${env} 'bash -s' <<-'ENDSSH2'
source venv/bin/activate
Expand Down
1 change: 1 addition & 0 deletions settings/staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
"https://staging-evalai.s3.amazonaws.com",
"https://staging.eval.ai",
"https://monitoring-staging.eval.ai",
"https://monitoring.eval.ai",
)