From b103ff00d0f83608037733079afdce70db8bb9f7 Mon Sep 17 00:00:00 2001 From: Amanuel Engeda <74629455+engedaam@users.noreply.github.com> Date: Wed, 24 Apr 2024 14:23:26 -0700 Subject: [PATCH] chore: Forward Container logs for E2E tests (#5982) --- .../e2e/run-tests-private-cluster/action.yaml | 4 ++++ .github/actions/e2e/setup-cluster/action.yaml | 15 ++++++++++----- .github/workflows/e2e-upgrade.yaml | 11 ++++++++++- .github/workflows/e2e.yaml | 11 ++++++++++- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/.github/actions/e2e/run-tests-private-cluster/action.yaml b/.github/actions/e2e/run-tests-private-cluster/action.yaml index 64504b9b6d58..8203553ebea4 100644 --- a/.github/actions/e2e/run-tests-private-cluster/action.yaml +++ b/.github/actions/e2e/run-tests-private-cluster/action.yaml @@ -125,6 +125,10 @@ runs: - kubectl delete ec2nodeclass --all - kubectl delete deployment --all - PRIVATE_CLUSTER=$CLUSTER_NAME TEST_SUITE=$SUITE ENABLE_METRICS=$ENABLE_METRICS METRICS_REGION=$METRICS_REGION GIT_REF="$(git rev-parse HEAD)" CLUSTER_NAME=$CLUSTER_NAME CLUSTER_ENDPOINT="$(aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.endpoint" --output text)" INTERRUPTION_QUEUE=$CLUSTER_NAME make e2etests + - aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/application --retention-in-days 30 + - aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/dataplane --retention-in-days 30 + - aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/host --retention-in-days 30 + - aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/performance --retention-in-days 30 post_build: commands: # Describe karpenter pods diff --git a/.github/actions/e2e/setup-cluster/action.yaml b/.github/actions/e2e/setup-cluster/action.yaml index ffda793d2fc0..aec619949048 100644 --- a/.github/actions/e2e/setup-cluster/action.yaml +++ 
b/.github/actions/e2e/setup-cluster/action.yaml @@ -30,7 +30,7 @@ inputs: default: "1.29" eksctl_version: description: "Version of eksctl to install" - default: v0.169.0 + default: v0.175.0 ip_family: description: "IP Family of the cluster. Valid values are IPv4 or IPv6" default: "IPv4" @@ -152,11 +152,9 @@ runs: minSize: 2 maxSize: 2 iam: + withAddonPolicies: + cloudWatch: true instanceRolePermissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary" - taints: - - key: CriticalAddonsOnly - value: "true" - effect: NoSchedule cloudWatch: clusterLogging: enableTypes: ["*"] @@ -175,6 +173,8 @@ runs: $KARPENTER_IAM withOIDC: true addons: + - name: amazon-cloudwatch-observability + permissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary" - name: vpc-cni permissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary" - name: coredns @@ -211,6 +211,11 @@ runs: else eksctl ${cmd} cluster -f clusterconfig.yaml fi + + # Adding taints after all necessary pods have been scheduled to the managed node group nodes + # amazon-cloudwatch-observability pods do not tolerate CriticalAddonsOnly=true:NoSchedule and + # the amazon-cloudwatch-observability addon does not allow adding tolerations to the addon pods as part of the advanced configuration + kubectl taint nodes CriticalAddonsOnly=true:NoSchedule --all - name: tag oidc provider of the cluster if: always() shell: bash diff --git a/.github/workflows/e2e-upgrade.yaml b/.github/workflows/e2e-upgrade.yaml index 0ad5a6e3f3e0..2c6b8a8c0f28 100644 --- a/.github/workflows/e2e-upgrade.yaml +++ b/.github/workflows/e2e-upgrade.yaml @@ -90,7 +90,7 @@ jobs: region: ${{ inputs.region }} cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} k8s_version: ${{ inputs.k8s_version }} - eksctl_version: v0.169.0 + eksctl_version: v0.175.0 ip_family: IPv4 # Set the value to IPv6 if IPv6 suite, else IPv4 git_ref: ${{ inputs.from_git_ref }} ecr_account_id: ${{ 
vars.SNAPSHOT_ACCOUNT_ID }} @@ -135,6 +135,15 @@ jobs: url: ${{ secrets.SLACK_WEBHOOK_URL }} suite: Upgrade git_ref: ${{ inputs.to_git_ref }} + - name: add log retention policy + if: ${{ inputs.workflow_trigger != 'private_cluster' }} + env: + CLUSTER_NAME: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} + run: | + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/application --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/dataplane --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/host --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/performance --retention-in-days 30 - name: dump logs on failure uses: ./.github/actions/e2e/dump-logs if: failure() || cancelled() diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index dc120754be4a..636c19c77b9c 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -132,7 +132,7 @@ jobs: region: ${{ inputs.region }} cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} k8s_version: ${{ inputs.k8s_version }} - eksctl_version: v0.169.0 + eksctl_version: v0.175.0 ip_family: ${{ contains(inputs.suite, 'IPv6') && 'IPv6' || 'IPv4' }} # Set the value to IPv6 if IPv6 suite, else IPv4 private_cluster: ${{ inputs.workflow_trigger == 'private_cluster' }} git_ref: ${{ inputs.git_ref }} @@ -187,6 +187,15 @@ jobs: suite: ${{ inputs.suite }} git_ref: ${{ inputs.git_ref }} workflow_trigger: ${{ inputs.workflow_trigger }} + - name: add log retention policy + if: ${{ inputs.workflow_trigger != 'private_cluster' }} + env: + CLUSTER_NAME: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} + run: | + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/application --retention-in-days 30 + aws logs put-retention-policy --log-group-name 
/aws/containerinsights/"$CLUSTER_NAME"/dataplane --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/host --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/performance --retention-in-days 30 - name: dump logs on failure uses: ./.github/actions/e2e/dump-logs if: (failure() || cancelled()) && inputs.workflow_trigger != 'private_cluster'