Skip to content

Commit

Permalink
Add support for linux al2
Browse files Browse the repository at this point in the history
  • Loading branch information
khanhntd committed Aug 30, 2022
1 parent 5b7e3a2 commit cfae917
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 32 deletions.
58 changes: 43 additions & 15 deletions .github/workflows/integrationTest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ jobs:
name: 'GenerateTestMatrix'
runs-on: ubuntu-latest
outputs:
ec2_linux_gpu_matrix: ${{ steps.set-matrix.outputs.ec2_linux_gpu_matrix }}
ec2_gpu_matrix: ${{ steps.set-matrix.outputs.ec2_gpu_matrix }}
ec2_linux_matrix: ${{ steps.set-matrix.outputs.ec2_linux_matrix }}
ec2_performance_matrix: ${{steps.set-matrix.outputs.ec2_performance_matrix}}
ec2_windows_matrix: ${{ steps.set-matrix.outputs.ec2_windows_matrix }}
Expand All @@ -152,19 +152,19 @@ jobs:
id: set-matrix
run: |
go run --tags=generator integration/generator/test_case_generator.go
echo "::set-output name=ec2_linux_gpu_matrix::$(echo $(cat integration/generator/resources/ec2_linux_gpu_complete_test_matrix.json))"
echo "::set-output name=ec2_gpu_matrix::$(echo $(cat integration/generator/resources/ec2_gpu_complete_test_matrix.json))"
echo "::set-output name=ec2_linux_matrix::$(echo $(cat integration/generator/resources/ec2_linux_complete_test_matrix.json))"
echo "::set-output name=ec2_performance_matrix::$(echo $(cat integration/generator/resources/ec2_performance_complete_test_matrix.json))"
echo "::set-output name=ec2_windows_matrix::$(echo $(cat integration/generator/resources/ec2_windows_complete_test_matrix.json))"
echo "::set-output name=ecs_fargate_matrix::$(echo $(cat integration/generator/resources/ecs_fargate_complete_test_matrix.json))"
- name: Echo test plan matrix
run: |
echo ${{ steps.set-matrix.outputs.ec2_linux_gpu_matrix }}
echo ${{ steps.set-matrix.outputs.ec2_linux_matrix }}
echo ${{ steps.set-matrix.outputs.ec2_performance_matrix}}
echo ${{ steps.set-matrix.outputs.ec2_windows_matrix }}
echo ${{ steps.set-matrix.outputs.ecs_fargate_matrix }}
# Print every generated matrix with a "<name>: " label so the job log shows
# which JSON payload belongs to which output.
echo "ec2_gpu_matrix: ${{ steps.set-matrix.outputs.ec2_gpu_matrix }}"
echo "ec2_linux_matrix: ${{ steps.set-matrix.outputs.ec2_linux_matrix }}"
echo "ec2_performance_matrix: ${{ steps.set-matrix.outputs.ec2_performance_matrix}}"
echo "ec2_windows_matrix: ${{ steps.set-matrix.outputs.ec2_windows_matrix }}"
echo "ecs_fargate_matrix: ${{ steps.set-matrix.outputs.ecs_fargate_matrix }}"
MakeMSIZip:
name: 'MakeMSIZip'
Expand Down Expand Up @@ -398,14 +398,14 @@ jobs:
echo "::set-output name=local_stack_host_name::$LOCAL_STACK_HOST_NAME" &&
aws s3 cp terraform.tfstate s3://${S3_INTEGRATION_BUCKET}/integration-test/local-stack-terraform-state/${GITHUB_SHA}/terraform.tfstate
EC2LinuxGPUIntegrationTest:
EC2NvidiaGPUIntegrationTest:
needs: [ MakeBinary, StartLocalStack, GenerateTestMatrix ]
name: 'EC2LinuxIntegrationTest'
name: 'EC2NVIDIAGPUIntegrationTest'
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
arrays: ${{ fromJson(needs.GenerateTestMatrix.outputs.ec2_linux_gpu_matrix) }}
arrays: ${{ fromJson(needs.GenerateTestMatrix.outputs.ec2_gpu_matrix) }}
permissions:
id-token: write
contents: read
Expand All @@ -423,7 +423,7 @@ jobs:
uses: actions/cache@v2
with:
path: go.mod
key: ec2-linux-integration-test-${{ github.sha }}-${{ matrix.arrays.os }}-${{ matrix.arrays.arc }}-${{ matrix.arrays.test_dir }}
key: ec2-nvidia-integration-test-${{ github.sha }}-${{ matrix.arrays.os }}-${{ matrix.arrays.arc }}-${{ matrix.arrays.test_dir }}

- name: Echo Test Info
run: echo run on ec2 instance os ${{ matrix.arrays.os }} arc ${{ matrix.arrays.arc }} test dir ${{ matrix.arrays.test_dir }}
Expand All @@ -433,7 +433,7 @@ jobs:

# nick-invision/retry@v2 starts at base dir
- name: Terraform apply
if: steps.ec2-linux-integration-test.outputs.cache-hit != 'true'
# Linux-family matrix entries only; skipped when the cache step reports a hit.
# String literals inside ${{ }} expressions must be single-quoted.
if: ${{ matrix.arrays.family == 'linux' && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }}
uses: nick-invision/retry@v2
with:
max_attempts: 3
Expand All @@ -460,15 +460,43 @@ jobs:
terraform destroy -auto-approve && exit 1
fi
- name: Terraform apply
  # Window-family matrix entries only; skipped when the cache step reports a
  # hit. String literals inside ${{ }} expressions must be single-quoted.
  if: ${{ matrix.arrays.family == 'window' && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }}
  uses: nick-invision/retry@v2
  with:
    max_attempts: 3
    timeout_minutes: 30
    retry_wait_seconds: 5
    # Applies the ec2/win Terraform stack and destroys it afterwards in both
    # the success and the failure branch.
    command: |
      cd integration/terraform/ec2/win
      terraform init
      if terraform apply --auto-approve \
        -var="ssh_key=${PRIVATE_KEY}" -var="key_name=${KEY_NAME}" \
        -var="github_repo=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" \
        -var="github_sha=${GITHUB_SHA}" -var="ami=${{ matrix.arrays.ami }}" \
        -var="test_name=cw-integ-test-${{ matrix.arrays.os }}" \
        -var="iam_instance_profile=${IAM_ROLE}" \
        -var="vpc_security_group_ids=${VPC_SECURITY_GROUPS_IDS}" \
        -var="s3_bucket=${S3_INTEGRATION_BUCKET}" ; then terraform destroy -auto-approve
      else
        # Apply failed: clean up, then exit non-zero so the attempt is
        # reported as failed.
        terraform destroy -auto-approve && exit 1
      fi
#This is here just in case workflow cancel
- name: Terraform destroy
if: ${{ cancelled() && steps.ec2-linux-integration-test.outputs.cache-hit != 'true' }}
if: ${{ cancelled() && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }}
uses: nick-invision/retry@v2
with:
max_attempts: 3
timeout_minutes: 8
retry_wait_seconds: 5
command: cd integration/terraform/ec2/linux && terraform destroy --auto-approve
command: |
  # Pick the Terraform working directory from the matrix entry's family
  # ("window" or "linux" in ec2_gpu_test_matrix.json). The os field holds
  # values such as "win-2019", so it can never equal "window".
  if [ "${{ matrix.arrays.family }}" = "window" ]; then
    cd integration/terraform/ec2/win
  else
    cd integration/terraform/ec2/linux
  fi
  terraform destroy --auto-approve
EC2LinuxIntegrationTest:
needs: [MakeBinary, StartLocalStack, GenerateTestMatrix]
Expand Down Expand Up @@ -606,7 +634,7 @@ jobs:
max_attempts: 3
timeout_minutes: 8
retry_wait_seconds: 5
command: cd cd integration/terraform/ec2/win && terraform destroy --auto-approve -var="ami=${{ matrix.arrays.ami }}"
command: cd integration/terraform/ec2/win && terraform destroy --auto-approve -var="ami=${{ matrix.arrays.ami }}"


StopLocalStack:
Expand Down
18 changes: 18 additions & 0 deletions integration/generator/resources/ec2_gpu_test_matrix.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[
{
"os": "al2",
"username": "ec2-user",
"instanceType":"g4dn.xlarge",
"installAgentCommand": "rpm -U ./amazon-cloudwatch-agent.rpm",
"ami": "cloudwatch-agent-integration-test-nvidia-gpu-al2*",
"caCertPath": "/etc/ssl/certs/ca-bundle.crt",
"arc": "amd64",
"binaryName": "amazon-cloudwatch-agent.rpm",
"family": "linux"
},
{
"os": "win-2019",
"ami": "cloudwatch-agent-integration-test-nvidia-gpu-win-2019*",
"family": "window"
}
]
12 changes: 0 additions & 12 deletions integration/generator/resources/ec2_linux_gpu_test_matrix.json

This file was deleted.

2 changes: 1 addition & 1 deletion integration/generator/test_case_generator.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const (

//you can't have a const map in golang
var osToTestDirMap = map[string][]string{
"ec2_linux_gpu": {
"ec2_gpu": {
"./integration/test/nvidia_gpu",
},
"ec2_linux": {
Expand Down
8 changes: 5 additions & 3 deletions integration/terraform/ecs/linux/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
Running ECS Fargate Integration Tests
=========================
## Prerequisite
* ECR Repository with the docker image

## 1. How ECS Fargate are set up?
## How is ECS Fargate set up?
**Step 1:** Create a Fargate ECS Cluster with the default VPC Network.
**Step 2:** Create a security group, to be assigned to the service in step 5, which allows all inbound
and outbound traffic
Expand All @@ -12,7 +14,7 @@ to decide which containers serve a specific task and assign the IAM roles in s
**Step 5:** Create a [service](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs_services.html) which configures
how many tasks run in parallel and ensures the availability of the tasks.

## 2. Setup resources
## Setup resources
By running `terraform apply -auto-approve -lock=false`,
you agree to set up the following resources:
* 1 IAM Task Role and 1 Execution Task Role (similar to [these IAM Roles](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/deploy_servicelens_CloudWatch_agent_deploy_ECS.html))
Expand Down Expand Up @@ -52,7 +54,7 @@ To be more specifically,
* **CloudWatchAgent Parameter Store:** Store CloudWatchAgent's configuration and CloudWatchAgent will pull the config from there. [Example configuration](default_resources/default_amazon_cloudwatch_agent.json)
* **Prometheus Parameter Store:** Store Prometheus's configuration and CloudWatchAgent will pull the config from there. [Example configuration](default_resources/default_ecs_prometheus.tpl)

## 3. Run tests in your AWS account
## Run tests in your AWS account
````
cd integration/terraform/ecs && terraform init && terraform apply -auto-approve \
-var="test_dir={{your test case folder name}}" \
Expand Down
4 changes: 3 additions & 1 deletion integration/test/nvidia_gpu/metrics_nvidia_gpu_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ const (
numberofAppendDimensions = 1
)

// expectedMetrics lists the metric names that TestNvidiaGPU passes to
// test.ValidateMetrics, one call per name.
var expectedMetrics = []string{
	"mem_used_percent",
	"nvidia_smi_utilization_gpu",
	"nvidia_smi_utilization_memory",
	"nvidia_smi_power_draw",
	"nvidia_smi_temperature_gpu",
}

func TestNvidiaGPU(t *testing.T) {
t.Run("Basic configuration testing for both metrics and logs", func(t *testing.T) {
test.CopyFile(configJSON, configOutputPath)
Expand All @@ -36,7 +38,7 @@ func TestNvidiaGPU(t *testing.T) {
test.StopAgent()

dimensionFilter := test.BuildDimensionFilterList(numberofAppendDimensions)
for _, metricName := range []string{"mem_used_percent", "utilization_gpu","utilization_memory","temperature_gpu","power_draw"} {
for _, metricName := range expectedMetrics {
test.ValidateMetrics(t, metricName, namespace, dimensionFilter)
}

Expand Down

0 comments on commit cfae917

Please sign in to comment.