Add production monitoring stack (Prometheus, Grafana, statsd-exporter,
node_exporter, Alertmanager) behind an nginx ingress for monitoring.eval.ai.

Review notes:
  * deploy.sh pulls/starts `node_exporter`, spelled exactly as the service is
    keyed in docker-compose-production.yml.
  * nginx_production.conf relies on `listen 443 ssl;` alone; the standalone
    `ssl on;` directive is obsolete and was dropped.
  * Indentation inside the hunks was reconstructed; verify against the tree.

diff --git a/docker-compose-production.yml b/docker-compose-production.yml
index fb0d4659e2..0d0d48f1f8 100644
--- a/docker-compose-production.yml
+++ b/docker-compose-production.yml
@@ -77,3 +77,80 @@ services:
       context: ./
       dockerfile: docker/prod/code-upload-worker/Dockerfile
 
+  prometheus:
+    hostname: prometheus
+    image: prom/prometheus:latest
+    user: "1000"
+    volumes:
+      - ./monitoring/prometheus/prometheus_production.yml:/etc/prometheus/prometheus.yml
+      - ./monitoring/prometheus/rules.yml:/etc/rules/rules.yml
+      - ./monitoring/prometheus/prometheus_db:/var/lib/prometheus
+      - ./monitoring/prometheus/prometheus_db:/prometheus
+      - ./monitoring/prometheus/prometheus_db:/etc/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--web.external-url=http://localhost:9090/prometheus'
+    ports:
+      - '9090:9090'
+
+  grafana:
+    hostname: grafana
+    image: grafana/grafana:latest
+    user: "1000"
+    env_file:
+      - docker/prod/docker_production.env
+    volumes:
+      - ./monitoring/grafana/grafana_db:/var/lib/grafana
+    depends_on:
+      - prometheus
+    ports:
+      - '3000:3000'
+
+  statsd-exporter:
+    hostname: statsd
+    image: prom/statsd-exporter:latest
+    command:
+      - '--log.level=info'
+      - '--web.telemetry-path=/statsd/metrics'
+    ports:
+      - '9125:9125/udp'
+      - '9125:9125/tcp'
+      - '9102:9102'
+
+  node_exporter:
+    hostname: node_exporter
+    image: prom/node-exporter
+    ports:
+      - '9100:9100'
+
+  nginx-ingress:
+    image: ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/evalai-production-nginx-ingress:${COMMIT_ID}
+    build:
+      context: ./
+      dockerfile: docker/prod/nginx-ingress/Dockerfile
+      args:
+        MONITORING_ENV: production
+    depends_on:
+      - prometheus
+      - grafana
+      - statsd-exporter
+      - alert-manager
+    ports:
+      - '80:80'
+      - '443:443'
+
+  alert-manager:
+    hostname: alert_manager
+    image: prom/alertmanager
+    user: "1000"
+    volumes:
+      - ./monitoring/prometheus:/prometheus
+      - ./monitoring/alertmanager/data:/data
+      - ./monitoring/alertmanager/templates:/etc/alertmanager/templates
+    command:
+      - '--config.file=/prometheus/alert_manager.yml'
+      - '--storage.path=/data'
+      - '--web.external-url=http://localhost:9093/alert_manager'
+    ports:
+      - '9093:9093'
+
diff --git a/docker/prod/nginx-ingress/nginx_production.conf b/docker/prod/nginx-ingress/nginx_production.conf
new file mode 100644
index 0000000000..0f3c2c5519
--- /dev/null
+++ b/docker/prod/nginx-ingress/nginx_production.conf
@@ -0,0 +1,67 @@
+upstream prometheus {
+  server prometheus:9090 fail_timeout=0;
+}
+
+upstream grafana {
+  server grafana:3000 fail_timeout=0;
+}
+
+upstream statsd_exporter {
+  server statsd:9102 fail_timeout=0;
+}
+
+upstream alert_manager {
+  server alert_manager:9093 fail_timeout=0;
+}
+
+server {
+  server_name monitoring.eval.ai;
+  listen 80;
+  return 301 https://monitoring.eval.ai$request_uri;
+}
+
+server {
+  server_name monitoring.eval.ai;
+  listen 443 ssl;
+  location / {
+    root /usr/share/nginx/html;
+    index index.html index.htm;
+  }
+
+  ssl_certificate /etc/ssl/eval_ai.crt;
+  ssl_certificate_key /etc/ssl/eval_ai.key;
+  ssl_prefer_server_ciphers on;
+  # enables all versions of TLS, but not SSLv2 or 3 which are weak and now deprecated.
+  ssl_protocols TLSv1 TLSv1.1 TLSv1.2;
+
+  access_log /var/log/nginx/access.log;
+  error_log /var/log/nginx/error.log;
+
+  location /prometheus {
+    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+    proxy_set_header Host $host;
+    proxy_set_header X-Forwarded-Proto $scheme;
+    proxy_pass http://prometheus;
+  }
+
+  location /grafana {
+    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+    proxy_set_header Host $host;
+    proxy_set_header X-Forwarded-Proto $scheme;
+    proxy_pass http://grafana;
+  }
+
+  location /statsd {
+    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+    proxy_set_header Host $host;
+    proxy_set_header X-Forwarded-Proto $scheme;
+    proxy_pass http://statsd_exporter;
+  }
+
+  location /alert_manager {
+    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+    proxy_set_header Host $host;
+    proxy_set_header X-Forwarded-Proto $scheme;
+    proxy_pass http://alert_manager;
+  }
+}
diff --git a/monitoring/prometheus/prometheus_production.yml b/monitoring/prometheus/prometheus_production.yml
new file mode 100644
index 0000000000..2761ea0c36
--- /dev/null
+++ b/monitoring/prometheus/prometheus_production.yml
@@ -0,0 +1,30 @@
+global:
+  scrape_interval: 30s
+  external_labels:
+    monitor: 'evalai-monitor'
+
+rule_files:
+  - /etc/rules/rules.yml
+
+scrape_configs:
+  - job_name: 'prometheus'
+    metrics_path: '/prometheus/metrics'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  - job_name: 'statsd'
+    metrics_path: '/statsd/metrics'
+    static_configs:
+      - targets: ['monitoring.eval.ai']
+
+  - job_name: 'node_exporter'
+    metrics_path: '/node_exporter'
+    static_configs:
+      - targets: ['eval.ai']
+
+alerting:
+  alertmanagers:
+    - path_prefix: '/alert_manager'
+      scheme: 'https'
+      static_configs:
+        - targets: ['monitoring.eval.ai']
diff --git a/requirements/common.txt b/requirements/common.txt
index 8f483f62fd..dd58d292c5 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -24,6 +24,7 @@ pickleshare==0.7.5
 Pillow==7.1.0
 psycopg2==2.8.4
 pycurl==7.43.0.6
+PyJWT==2.1.0
 PyYaml==5.1
 proc==1.0
 rstr==2.2.6
diff --git a/scripts/deployment/deploy.sh b/scripts/deployment/deploy.sh
index b56aa606ac..69a4864a55 100755
--- a/scripts/deployment/deploy.sh
+++ b/scripts/deployment/deploy.sh
@@ -49,8 +49,8 @@ case $opt in
 					eval $(aws ecr get-login --no-include-email)
 					aws s3 cp s3://cloudcv-secrets/evalai/${env}/docker_${env}.env ./docker/prod/docker_${env}.env
 					docker-compose -f docker-compose-${env}.yml rm -s -v -f
-					docker-compose -f docker-compose-${env}.yml pull django nodejs celery
-					docker-compose -f docker-compose-${env}.yml up -d --force-recreate --remove-orphans django nodejs nodejs_v2 celery
+					docker-compose -f docker-compose-${env}.yml pull django nodejs celery nodejs_v2 node_exporter
+					docker-compose -f docker-compose-${env}.yml up -d --force-recreate --remove-orphans django nodejs nodejs_v2 celery node_exporter
 				ENDSSH2
 				ssh ubuntu@${MONITORING_INSTANCE} -o StrictHostKeyChecking=no AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID} COMMIT_ID=${COMMIT_ID} env=${env} 'bash -s' <<-'ENDSSH2'
 					source venv/bin/activate
diff --git a/settings/staging.py b/settings/staging.py
index dd6ab3b647..946802cba8 100644
--- a/settings/staging.py
+++ b/settings/staging.py
@@ -9,4 +9,5 @@
     "https://staging-evalai.s3.amazonaws.com",
     "https://staging.eval.ai",
     "https://monitoring-staging.eval.ai",
+    "https://monitoring.eval.ai",
 )