diff --git a/docker-compose-production.yml b/docker-compose-production.yml index fb0d4659e2..a050ce60ae 100644 --- a/docker-compose-production.yml +++ b/docker-compose-production.yml @@ -77,3 +77,80 @@ services: context: ./ dockerfile: docker/prod/code-upload-worker/Dockerfile + prometheus: + hostname: prometheus + image: prom/prometheus:latest + user: "1000" + volumes: + - ./monitoring/prometheus/prometheus_production.yml:/etc/prometheus/prometheus.yml + - ./monitoring/prometheus/rules.yml:/etc/rules/rules.yml + - ./monitoring/prometheus/prometheus_db:/var/lib/prometheus + - ./monitoring/prometheus/prometheus_db:/prometheus + - ./monitoring/prometheus/prometheus_db:/etc/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--web.external-url=http://localhost:9090/prometheus' + ports: + - '9090:9090' + + grafana: + hostname: grafana + image: grafana/grafana:latest + user: "1000" + env_file: + - docker/prod/docker_production.env + volumes: + - ./monitoring/grafana/grafana_db:/var/lib/grafana + depends_on: + - prometheus + ports: + - '3000:3000' + + statsd-exporter: + hostname: statsd + image: prom/statsd-exporter:latest + command: + - '--log.level=info' + - '--web.telemetry-path=/statsd/metrics' + ports: + - '9125:9125/udp' + - '9125:9125/tcp' + - '9102:9102' + + node_exporter: + hostname: node_exporter + image: prom/node-exporter + ports: + - '9100:9100' + + nginx-ingress: + image: ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/evalai-production-nginx-ingress:${COMMIT_ID} + build: + context: ./ + dockerfile: docker/prod/nginx-ingress/Dockerfile + args: + MONITORING_ENV: production + depends_on: + - prometheus + - grafana + - statsd-exporter + - alert-manager + ports: + - '80:80' + - '443:443' + + alert-manager: + hostname: alert_manager + image: prom/alertmanager + user: "1000" + volumes: + - ./monitoring/prometheus:/prometheus + - ./monitoring/alertmanager/data:/data + - ./monitoring/alertmanager/templates:/etc/alertmanager/templates + command: + - '--config.file=/prometheus/alert_manager.yml' + - '--storage.path=/data' + - '--web.external-url=http://localhost:9093/alert_manager' + ports: + - '9093:9093' + diff --git a/docker/prod/nginx-ingress/nginx_production.conf b/docker/prod/nginx-ingress/nginx_production.conf new file mode 100644 index 0000000000..0f3c2c5519 --- /dev/null +++ b/docker/prod/nginx-ingress/nginx_production.conf @@ -0,0 +1,68 @@ +upstream prometheus { + server prometheus:9090 fail_timeout=0; +} + +upstream grafana { + server grafana:3000 fail_timeout=0; +} + +upstream statsd_exporter { + server statsd:9102 fail_timeout=0; +} + +upstream alert_manager { + server alert_manager:9093 fail_timeout=0; +} + +server { + server_name monitoring.eval.ai; + listen 80; + return 301 https://monitoring.eval.ai$request_uri; +} + +server { + server_name monitoring.eval.ai; + listen 443 ssl; + location / { + root /usr/share/nginx/html; + index index.html index.htm; + } + + ssl on; + ssl_certificate /etc/ssl/eval_ai.crt; + ssl_certificate_key /etc/ssl/eval_ai.key; + ssl_prefer_server_ciphers on; + # enables all versions of TLS, but not SSLv2 or 3 which are weak and now deprecated. + ssl_protocols TLSv1 TLSv1.1 TLSv1.2; + + access_log /var/log/nginx/access.log; + error_log /var/log/nginx/error.log; + + location /prometheus { + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_pass http://prometheus; + } + + location /grafana { + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_pass http://grafana; + } + + location /statsd { + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_pass http://statsd_exporter; + } + + location /alert_manager { + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_pass http://alert_manager; + } +} diff --git a/scripts/deployment/deploy.sh b/scripts/deployment/deploy.sh index b56aa606ac..659448737c 100755 --- a/scripts/deployment/deploy.sh +++ b/scripts/deployment/deploy.sh @@ -4,41 +4,41 @@ set -e opt=${1} aws_login() { - aws configure set default.region us-east-1 - eval $(aws ecr get-login --no-include-email) + aws configure set default.region us-east-1 + eval $(aws ecr get-login --no-include-email) } if [ -z ${AWS_ACCOUNT_ID} ]; then - echo "AWS_ACCOUNT_ID not set." - exit 0 + echo "AWS_ACCOUNT_ID not set." + exit 0 fi if [ -z ${COMMIT_ID} ]; then - export COMMIT_ID="latest" + export COMMIT_ID="latest" fi if [ -z ${TRAVIS_BRANCH} ]; then - echo "Please set the TRAVIS_BRANCH first." + echo "Please set the TRAVIS_BRANCH first." fi env=${TRAVIS_BRANCH} JUMPBOX=${JUMPBOX_INSTANCE} if [[ ${env} == "production" ]]; then - INSTANCE=${PRODUCTION_INSTANCE} - MONITORING_INSTANCE=${PRODUCTION_MONITORING_INSTANCE} + INSTANCE=${PRODUCTION_INSTANCE} + MONITORING_INSTANCE=${PRODUCTION_MONITORING_INSTANCE} elif [[ ${env} == "staging" ]]; then - INSTANCE=${STAGING_INSTANCE} - MONITORING_INSTANCE=${STAGING_MONITORING_INSTANCE} + INSTANCE=${STAGING_INSTANCE} + MONITORING_INSTANCE=${STAGING_MONITORING_INSTANCE} else - echo "Skipping deployment since commit not on staging or production branch." - exit 0 + echo "Skipping deployment since commit not on staging or production branch." + exit 0 fi case $opt in - auto_deploy) - chmod 400 scripts/deployment/evalai.pem - ssh-add scripts/deployment/evalai.pem + auto_deploy) + chmod 400 scripts/deployment/evalai.pem + ssh-add scripts/deployment/evalai.pem ssh -A ubuntu@${JUMPBOX} -o StrictHostKeyChecking=no INSTANCE=${INSTANCE} MONITORING_INSTANCE=${MONITORING_INSTANCE} AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID} COMMIT_ID=${COMMIT_ID} env=${env} 'bash -s' <<-'ENDSSH' ssh ubuntu@${INSTANCE} -o StrictHostKeyChecking=no AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID} COMMIT_ID=${COMMIT_ID} env=${env} 'bash -s' <<-'ENDSSH2' source venv/bin/activate @@ -49,182 +49,182 @@ case $opt in eval $(aws ecr get-login --no-include-email) aws s3 cp s3://cloudcv-secrets/evalai/${env}/docker_${env}.env ./docker/prod/docker_${env}.env docker-compose -f docker-compose-${env}.yml rm -s -v -f - docker-compose -f docker-compose-${env}.yml pull django nodejs celery - docker-compose -f docker-compose-${env}.yml up -d --force-recreate --remove-orphans django nodejs nodejs_v2 celery + docker-compose -f docker-compose-${env}.yml pull django nodejs celery node_exporter + docker-compose -f docker-compose-${env}.yml up -d --force-recreate --remove-orphans django nodejs nodejs_v2 node_exporter celery ENDSSH2 ssh ubuntu@${MONITORING_INSTANCE} -o StrictHostKeyChecking=no AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID} COMMIT_ID=${COMMIT_ID} env=${env} 'bash -s' <<-'ENDSSH2' source venv/bin/activate - cd ~/Projects/EvalAI + cd ~/Projects/EvalAI export AWS_ACCOUNT_ID=${AWS_ACCOUNT_ID} export COMMIT_ID=${COMMIT_ID} export AWS_DEFAULT_REGION=us-east-1 eval $(aws ecr get-login --no-include-email) aws s3 cp s3://cloudcv-secrets/evalai/${env}/docker_${env}.env ./docker/prod/docker_${env}.env - aws s3 cp s3://cloudcv-secrets/evalai/${env}/alert_manager.yml ./monitoring/prometheus/alert_manager.yml + aws s3 cp s3://cloudcv-secrets/evalai/${env}/alert_manager.yml ./monitoring/prometheus/alert_manager.yml docker-compose -f docker-compose-${env}.yml rm -s -v -f docker-compose -f docker-compose-${env}.yml pull nginx-ingress prometheus grafana statsd-exporter alert-manager docker-compose -f docker-compose-${env}.yml up -d --force-recreate --remove-orphans nginx-ingress prometheus grafana statsd-exporter alert-manager ENDSSH2 ENDSSH - ;; - pull) - aws_login; - echo "Pulling environment variables file..." - aws s3 cp s3://cloudcv-secrets/evalai/${env}/docker_${env}.env ./docker/prod/docker_${env}.env - echo "Environment varibles file successfully downloaded." - echo "Pulling docker images from ECR..." - docker-compose -f docker-compose-${env}.yml pull - echo "Completed Pull operation." - ;; - deploy-django) - echo "Deploying django docker container..." - docker-compose -f docker-compose-${env}.yml up -d django - echo "Completed deploy operation." - ;; - deploy-nodejs) - echo "Deploying nodejs docker container..." - docker-compose -f docker-compose-${env}.yml up -d nodejs - echo "Completed deploy operation." - ;; - deploy-nodejs-v2) - echo "Deploying new frontend docker container..." - docker-compose -f docker-compose-${env}.yml up -d nodejs_v2 - echo "Completed deploy operation." - ;; - deploy-celery) - echo "Deploying celery docker container..." - docker-compose -f docker-compose-${env}.yml up -d celery - echo "Completed deploy operation." - ;; - deploy-worker) - token=${3} - challenge=${4} - if [ -z "$4" ]; then - echo "Please input Challenge ID" - exit 0 - fi - echo "Pulling queue name for $env server challenge..." - if [ ${env} == "staging" ]; then - queue_name=$(curl -k -L -X GET -H "Authorization: Token $token" https://staging.eval.ai/api/challenges/get_broker_url/$challenge/) - elif [ ${env} == "production" ]; then - queue_name=$(curl -k -L -X GET -H "Authorization: Token $token" https://eval.ai/api/challenges/get_broker_url/$challenge/) - fi - echo "Completed pulling Queue name" - # preprocess the python list to bash array - queue_name=($(echo ${queue_name//,/ } | tr -d '[]')) - queue=$(echo $queue_name | tr -d '"') - echo "Deploying worker for queue: " $queue - docker-compose -f docker-compose-${env}.yml run --name=worker_${queue} -e CHALLENGE_QUEUE=$queue -e CHALLENGE_PK=$challenge -d worker - echo "Deployed worker docker container for queue: " $queue - ;; - deploy-remote-worker) - token=${3} - broker_url=${4} - if [ -z "$3" ]; then - echo "Please input Auth Token" - exit 0 - fi - if [ -z "$4" ]; then - echo "Please input Broker URL" - exit 0 - fi - echo "Deploying worker for queue: " $queue - docker-compose -f docker-compose-${env}.yml run --name=remote_worker_${queue} -e QUEUE_NAME=$queue -e AUTH_TOKEN=$token -d worker - echo "Deployed worker docker container for queue: " $queue - ;; - deploy-workers) - token=${3} - echo "Pulling queue names for $env server challenges..." - if [ ${env} == "staging" ]; then - queue_names=$(curl -k -L -X GET -H "Authorization: Token $token" https://staging.eval.ai/api/challenges/get_broker_urls/) - elif [ ${env} == "production" ]; then - queue_names=$(curl -k -L -X GET -H "Authorization: Token $token" https://eval.ai/api/challenges/get_broker_urls/) - fi - echo "Completed pulling Queue list" - # preprocess the python list to bash array - queue_names=($(echo ${queue_names//,/ } | tr -d '[]')) - for queue_name in "${queue_names[@]}" - do - queue=$(echo $queue_name | tr -d '"') - echo "Deploying worker for queue: " $queue - docker-compose -f docker-compose-${env}.yml run --name=worker_${queue} -e CHALLENGE_QUEUE=$queue -d worker - echo "Deployed worker docker container for queue: " $queue - done - ;; - deploy-prometheus) - echo "Deploying prometheus docker container..." - docker-compose -f docker-compose-${env}.yml up -d prometheus - echo "Completed deploy operation." - ;; - deploy-grafana) - echo "Deploying grafana docker container..." - docker-compose -f docker-compose-${env}.yml up -d grafana - echo "Completed deploy operation." - ;; - deploy-statsd) - echo "Deploying statsd docker container..." - docker-compose -f docker-compose-${env}.yml up -d statsd-exporter - echo "Completed deploy operation." - ;; - deploy-node-exporter) - echo "Deploying node_exporter docker container..." - docker-compose -f docker-compose-${env}.yml up -d node_exporter - echo "Completed deploy operation." - ;; - deploy-alert-manager) - echo "Deploying alertmanager docker container..." - docker-compose -f docker-compose-${env}.yml up -d alert-manager - echo "Completed deploy operation." - ;; - scale) - service=${3} - instances=${4} - echo "Scaling the containers..." - docker-compose -f docker-compose-${env}.yml scale ${service}=${instances} - ;; - clean) - { - docker-compose -f docker-compose-${env}.yml rm -s -v -f - } || { - echo "Delete operation skipped since no container or image found!" - } - docker rmi $(docker images -a -q) - echo "Sucessfully cleaned all the images." - ;; - *) - echo "EvalAI deployment utility script" - echo " Usage: $0 {pull|deploy|scale|clean}" - echo - echo " auto_deploy : Deploy staging or production branch to staging or production server respectively." - echo " Eg. ./scripts/deployment/deploy.sh auto_deploy" - echo " pull : Pull docker images from ECR." - echo " Eg. ./scripts/deployment/deploy.sh pull production" - echo " deploy-django : Deploy django containers in the respective environment." - echo " Eg. ./scripts/deployment/deploy.sh deploy-django production" - echo " deploy-nodejs : Deploy nodejs containers in the respective environment." - echo " Eg. ./scripts/deployment/deploy.sh deploy-nodejs production" - echo " deploy-nodejs-v2 : Deploy new frontend container in the respective environment." - echo " Eg. ./scripts/deployment/deploy.sh deploy-nodejs-v2 production" - echo " deploy-celery : Deploy celery containers in the respective environment." - echo " Eg. ./scripts/deployment/deploy.sh deploy-celery production" - echo " deploy-worker : Deploy worker container for a challenge using challenge pk." - echo " Eg. ./scripts/deployment/deploy.sh deploy-worker production " - echo " deploy-remote-worker : Deploy remote worker container for a challenge using host auth token and challenge queue name." - echo " Eg. ./scripts/deployment/deploy.sh deploy-remote-worker production " - echo " deploy-workers : Deploy worker containers in the respective environment." - echo " Eg. ./scripts/deployment/deploy.sh deploy production " - echo " deploy-prometheus : Deploy prometheus container in the respective environment." - echo " Eg. ./scripts/deployment/deploy.sh deploy-prometheus production" - echo " deploy-grafana : Deploy grafana container in the respective environment." - echo " Eg. ./scripts/deployment/deploy.sh deploy-grafana production" - echo " deploy-statsd : Deploy statsd container in the respective environment." - echo " Eg. ./scripts/deployment/deploy.sh deploy-statsd production" - echo " deploy-node-exporter : Deploy node_exporter container in the respective environment." - echo " Eg. ./scripts/deployment/deploy.sh deploy-node-exporter production" - echo " deploy-alert-manager : Deploy alertmanager container in the respective environment." - echo " Eg. ./scripts/deployment/deploy.sh deploy-alert-manager production" - echo " scale : Scale particular docker service in an environment." - echo " Eg. ./scripts/deployment/deploy.sh scale production django 5" - echo " clean : Remove all docker containers and images." - echo " Eg. ./scripts/deployment/deploy.sh clean production" + ;; + pull) + aws_login; + echo "Pulling environment variables file..." + aws s3 cp s3://cloudcv-secrets/evalai/${env}/docker_${env}.env ./docker/prod/docker_${env}.env + echo "Environment varibles file successfully downloaded." + echo "Pulling docker images from ECR..." + docker-compose -f docker-compose-${env}.yml pull + echo "Completed Pull operation." + ;; + deploy-django) + echo "Deploying django docker container..." + docker-compose -f docker-compose-${env}.yml up -d django + echo "Completed deploy operation." + ;; + deploy-nodejs) + echo "Deploying nodejs docker container..." + docker-compose -f docker-compose-${env}.yml up -d nodejs + echo "Completed deploy operation." + ;; + deploy-nodejs-v2) + echo "Deploying new frontend docker container..." + docker-compose -f docker-compose-${env}.yml up -d nodejs_v2 + echo "Completed deploy operation." + ;; + deploy-celery) + echo "Deploying celery docker container..." + docker-compose -f docker-compose-${env}.yml up -d celery + echo "Completed deploy operation." + ;; + deploy-worker) + token=${3} + challenge=${4} + if [ -z "$4" ]; then + echo "Please input Challenge ID" + exit 0 + fi + echo "Pulling queue name for $env server challenge..." + if [ ${env} == "staging" ]; then + queue_name=$(curl -k -L -X GET -H "Authorization: Token $token" https://staging.eval.ai/api/challenges/get_broker_url/$challenge/) + elif [ ${env} == "production" ]; then + queue_name=$(curl -k -L -X GET -H "Authorization: Token $token" https://eval.ai/api/challenges/get_broker_url/$challenge/) + fi + echo "Completed pulling Queue name" + # preprocess the python list to bash array + queue_name=($(echo ${queue_name//,/ } | tr -d '[]')) + queue=$(echo $queue_name | tr -d '"') + echo "Deploying worker for queue: " $queue + docker-compose -f docker-compose-${env}.yml run --name=worker_${queue} -e CHALLENGE_QUEUE=$queue -e CHALLENGE_PK=$challenge -d worker + echo "Deployed worker docker container for queue: " $queue + ;; + deploy-remote-worker) + token=${3} + broker_url=${4} + if [ -z "$3" ]; then + echo "Please input Auth Token" + exit 0 + fi + if [ -z "$4" ]; then + echo "Please input Broker URL" + exit 0 + fi + echo "Deploying worker for queue: " $queue + docker-compose -f docker-compose-${env}.yml run --name=remote_worker_${queue} -e QUEUE_NAME=$queue -e AUTH_TOKEN=$token -d worker + echo "Deployed worker docker container for queue: " $queue + ;; + deploy-workers) + token=${3} + echo "Pulling queue names for $env server challenges..." + if [ ${env} == "staging" ]; then + queue_names=$(curl -k -L -X GET -H "Authorization: Token $token" https://staging.eval.ai/api/challenges/get_broker_urls/) + elif [ ${env} == "production" ]; then + queue_names=$(curl -k -L -X GET -H "Authorization: Token $token" https://eval.ai/api/challenges/get_broker_urls/) + fi + echo "Completed pulling Queue list" + # preprocess the python list to bash array + queue_names=($(echo ${queue_names//,/ } | tr -d '[]')) + for queue_name in "${queue_names[@]}" + do + queue=$(echo $queue_name | tr -d '"') + echo "Deploying worker for queue: " $queue + docker-compose -f docker-compose-${env}.yml run --name=worker_${queue} -e CHALLENGE_QUEUE=$queue -d worker + echo "Deployed worker docker container for queue: " $queue + done + ;; + deploy-prometheus) + echo "Deploying prometheus docker container..." + docker-compose -f docker-compose-${env}.yml up -d prometheus + echo "Completed deploy operation." + ;; + deploy-grafana) + echo "Deploying grafana docker container..." + docker-compose -f docker-compose-${env}.yml up -d grafana + echo "Completed deploy operation." + ;; + deploy-statsd) + echo "Deploying statsd docker container..." + docker-compose -f docker-compose-${env}.yml up -d statsd-exporter + echo "Completed deploy operation." + ;; + deploy-node-exporter) + echo "Deploying node_exporter docker container..." + docker-compose -f docker-compose-${env}.yml up -d node_exporter + echo "Completed deploy operation." + ;; + deploy-alert-manager) + echo "Deploying alertmanager docker container..." + docker-compose -f docker-compose-${env}.yml up -d alert-manager + echo "Completed deploy operation." + ;; + scale) + service=${3} + instances=${4} + echo "Scaling the containers..." + docker-compose -f docker-compose-${env}.yml scale ${service}=${instances} + ;; + clean) + { + docker-compose -f docker-compose-${env}.yml rm -s -v -f + } || { + echo "Delete operation skipped since no container or image found!" + } + docker rmi $(docker images -a -q) + echo "Sucessfully cleaned all the images." + ;; + *) + echo "EvalAI deployment utility script" + echo " Usage: $0 {pull|deploy|scale|clean}" + echo + echo " auto_deploy : Deploy staging or production branch to staging or production server respectively." + echo " Eg. ./scripts/deployment/deploy.sh auto_deploy" + echo " pull : Pull docker images from ECR." + echo " Eg. ./scripts/deployment/deploy.sh pull production" + echo " deploy-django : Deploy django containers in the respective environment." + echo " Eg. ./scripts/deployment/deploy.sh deploy-django production" + echo " deploy-nodejs : Deploy nodejs containers in the respective environment." + echo " Eg. ./scripts/deployment/deploy.sh deploy-nodejs production" + echo " deploy-nodejs-v2 : Deploy new frontend container in the respective environment." + echo " Eg. ./scripts/deployment/deploy.sh deploy-nodejs-v2 production" + echo " deploy-celery : Deploy celery containers in the respective environment." + echo " Eg. ./scripts/deployment/deploy.sh deploy-celery production" + echo " deploy-worker : Deploy worker container for a challenge using challenge pk." + echo " Eg. ./scripts/deployment/deploy.sh deploy-worker production " + echo " deploy-remote-worker : Deploy remote worker container for a challenge using host auth token and challenge queue name." + echo " Eg. ./scripts/deployment/deploy.sh deploy-remote-worker production " + echo " deploy-workers : Deploy worker containers in the respective environment." + echo " Eg. ./scripts/deployment/deploy.sh deploy production " + echo " deploy-prometheus : Deploy prometheus container in the respective environment." + echo " Eg. ./scripts/deployment/deploy.sh deploy-prometheus production" + echo " deploy-grafana : Deploy grafana container in the respective environment." + echo " Eg. ./scripts/deployment/deploy.sh deploy-grafana production" + echo " deploy-statsd : Deploy statsd container in the respective environment." + echo " Eg. ./scripts/deployment/deploy.sh deploy-statsd production" + echo " deploy-node-exporter : Deploy node_exporter container in the respective environment." + echo " Eg. ./scripts/deployment/deploy.sh deploy-node-exporter production" + echo " deploy-alert-manager : Deploy alertmanager container in the respective environment." + echo " Eg. ./scripts/deployment/deploy.sh deploy-alert-manager production" + echo " scale : Scale particular docker service in an environment." + echo " Eg. ./scripts/deployment/deploy.sh scale production django 5" + echo " clean : Remove all docker containers and images." + echo " Eg. ./scripts/deployment/deploy.sh clean production" esac