From 5d7feed50b49228a141fa6682b5908719ca1a71f Mon Sep 17 00:00:00 2001
From: Harry
Date: Wed, 13 Dec 2023 16:29:24 -0800
Subject: [PATCH] Add Retry Mechanism to E2E EC2 Terraform Deployment (#635)

* Add Retry Mechanism to E2E EC2 Terraform Deployment

* Add Extra Comments

* Refactor code
---
 .github/workflows/appsignals-e2e-ec2-test.yml | 88 +++++++++++++------
 .github/workflows/appsignals-e2e-eks-test.yml |  2 +-
 2 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/appsignals-e2e-ec2-test.yml b/.github/workflows/appsignals-e2e-ec2-test.yml
index dcbe2019df..6d1334e15c 100644
--- a/.github/workflows/appsignals-e2e-ec2-test.yml
+++ b/.github/workflows/appsignals-e2e-ec2-test.yml
@@ -56,18 +56,72 @@ jobs:
         with:
           terraform_wrapper: false
 
-      - name: Deploy sample app via terraform
+      - name: Deploy sample app via terraform and wait for endpoint to come online
         working-directory: testing/terraform/ec2
         run: |
           terraform init
           terraform validate
-          terraform apply -auto-approve \
-            -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
-            -var="test_id=${{ env.TESTING_ID }}" \
-            -var="sample_app_jar=${{ env.SAMPLE_APP_FRONTEND_SERVICE_JAR }}" \
-            -var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" \
-            -var="cw_agent_rpm=${{ env.APP_SIGNALS_CW_AGENT_RPM }}" \
-            -var="adot_jar=${{ env.APP_SIGNALS_ADOT_JAR }}"
+
+          # Attempt to deploy the sample app on an EC2 instance and wait for its endpoint to come online.
+          # There may be occasional failures due to transient issues, so try up to 2 times.
+          # deployment_failed of 0 indicates that both the terraform deployment and the endpoint are running, while 1 indicates
+          # that it failed at some point. Any non-zero terraform exit status is normalized to 1 so the checks below stay valid.
+          retry_counter=0
+          max_retry=2
+          while [ $retry_counter -lt $max_retry ]; do
+            echo "Attempt $retry_counter"
+            deployment_failed=0
+            terraform apply -auto-approve \
+              -var="aws_region=${{ env.AWS_DEFAULT_REGION }}" \
+              -var="test_id=${{ env.TESTING_ID }}" \
+              -var="sample_app_jar=${{ env.SAMPLE_APP_FRONTEND_SERVICE_JAR }}" \
+              -var="sample_remote_app_jar=${{ env.SAMPLE_APP_REMOTE_SERVICE_JAR }}" \
+              -var="cw_agent_rpm=${{ env.APP_SIGNALS_CW_AGENT_RPM }}" \
+              -var="adot_jar=${{ env.APP_SIGNALS_ADOT_JAR }}" \
+              || deployment_failed=1
+
+            if [ $deployment_failed -eq 1 ]; then
+              echo "Terraform deployment was unsuccessful. Will attempt to retry deployment."
+            fi
+
+            # If the deployment_failed is still 0, then the terraform deployment succeeded and now try to connect to the endpoint.
+            # Attempts to connect will be made for up to 10 minutes
+            if [ $deployment_failed -eq 0 ]; then
+              echo "Attempting to connect to the endpoint"
+              sample_app_endpoint=http://$(terraform output sample_app_main_service_public_dns):8080
+              attempt_counter=0
+              max_attempts=60
+              until $(curl --output /dev/null --silent --head --fail $(echo "$sample_app_endpoint" | tr -d '"')); do
+                if [ ${attempt_counter} -eq ${max_attempts} ];then
+                  echo "Failed to connect to endpoint. Will attempt to redeploy sample app."
+                  deployment_failed=1
+                  break
+                fi
+
+                printf '.'
+                attempt_counter=$(($attempt_counter+1))
+                sleep 10
+              done
+            fi
+
+            # If deployment_failed is 1 then either the terraform deployment or the endpoint connection failed, so first destroy the
+            # resources created from terraform and try again.
+            if [ $deployment_failed -eq 1 ]; then
+              echo "Destroying terraform"
+              terraform destroy -auto-approve \
+                -var="test_id=${{ env.TESTING_ID }}"
+
+              retry_counter=$(($retry_counter+1))
+            else
+              # If deployment succeeded, then exit the loop
+              break
+            fi
+
+            if [ $retry_counter -eq $max_retry ]; then
+              echo "Max retry reached, failed to deploy terraform and connect to the endpoint. Exiting code"
+              exit 1
+            fi
+          done
 
       - name: Get the ec2 instance ami id
         run: |
@@ -80,22 +134,6 @@ jobs:
           echo "REMOTE_SERVICE_IP=$(terraform output sample_app_remote_service_public_ip)" >> $GITHUB_ENV
         working-directory: testing/terraform/ec2
 
-      - name: Wait for app endpoint to come online
-        id: endpoint-check
-        run: |
-          attempt_counter=0
-          max_attempts=30
-          until $(curl --output /dev/null --silent --head --fail http://${{ env.MAIN_SERVICE_ENDPOINT }}); do
-            if [ ${attempt_counter} -eq ${max_attempts} ];then
-              echo "Max attempts reached"
-              exit 1
-            fi
-
-            printf '.'
-            attempt_counter=$(($attempt_counter+1))
-            sleep 10
-          done
-
       # This steps increases the speed of the validation by creating the telemetry data in advance
       - name: Call all test APIs
         continue-on-error: true
@@ -182,4 +220,4 @@ jobs:
         working-directory: testing/terraform/ec2
         run: |
           terraform destroy -auto-approve \
-            -var="test_id=${{ env.TESTING_ID }}"
+            -var="test_id=${{ env.TESTING_ID }}"
\ No newline at end of file
diff --git a/.github/workflows/appsignals-e2e-eks-test.yml b/.github/workflows/appsignals-e2e-eks-test.yml
index b837105aa5..404865a940 100644
--- a/.github/workflows/appsignals-e2e-eks-test.yml
+++ b/.github/workflows/appsignals-e2e-eks-test.yml
@@ -332,4 +332,4 @@ jobs:
             --name service-account-${{ env.TESTING_ID }} \
             --namespace ${{ env.SAMPLE_APP_NAMESPACE }} \
             --cluster ${{ inputs.test-cluster-name }} \
-            --region ${{ env.AWS_DEFAULT_REGION }}
+            --region ${{ env.AWS_DEFAULT_REGION }}
\ No newline at end of file