Skip to content

Commit

Permalink
Add NVIDIA GPU Integration Test and Security Checking for binaries (a…
Browse files Browse the repository at this point in the history
…ws#579)

* Change to ec2 linux advanced and simple

* Add basic NVIDIA GPU for Linux

* Add basic check file control for windows

* Add basic nvidia check

* Finish security check for linux and macos

* Finish basic nvidia_gpu for security checking

* Add support for linux al2

* Separate function between windows and linux

* Separate function between windows and linux--amend

* Add basic windows util

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add basic function for windows--amend

* Add support for linux al2

* Change to user data

* Basic OPENSSH

* Basic OPENSSH

* Return to test integration test on github

* Return to test integration test on github--amend

* Return to test integration test on github--amend

* Return to test integration test on github--amend

* Return to test integration test on github--amend

* Basic OPENSSH

* Basic OPENSSH

* Finish NVIDIA GPU--amend

* Finish NVIDIA GPU--amend

* Finish NVIDIA GPU--amend

* Finish NVIDIA GPU--amend

* Finish NVIDIA GPU--amend

* Finish NVIDIA GPU--amend

* Finish NVIDIA GPU

* revert some gpg

* Finish security checking and nvidia gpu

Co-authored-by: Ameen <[email protected]>
  • Loading branch information
khanhntd and aateeqi authored Sep 8, 2022
1 parent 53040cd commit ab4b3f0
Show file tree
Hide file tree
Showing 34 changed files with 1,028 additions and 191 deletions.
115 changes: 109 additions & 6 deletions .github/workflows/integrationTest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,12 @@ jobs:
${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_INTEGRATION_TEST_REPO }}:${{ github.sha }}
platforms: linux/amd64, linux/arm64


GenerateTestMatrix:
name: 'GenerateTestMatrix'
runs-on: ubuntu-latest
outputs:
ec2_gpu_matrix: ${{ steps.set-matrix.outputs.ec2_gpu_matrix }}
ec2_linux_matrix: ${{ steps.set-matrix.outputs.ec2_linux_matrix }}
ec2_performance_matrix: ${{steps.set-matrix.outputs.ec2_performance_matrix}}
ec2_windows_matrix: ${{ steps.set-matrix.outputs.ec2_windows_matrix }}
Expand All @@ -138,17 +140,19 @@ jobs:
id: set-matrix
run: |
go run --tags=generator integration/generator/test_case_generator.go
echo "::set-output name=ec2_gpu_matrix::$(echo $(cat integration/generator/resources/ec2_gpu_complete_test_matrix.json))"
echo "::set-output name=ec2_linux_matrix::$(echo $(cat integration/generator/resources/ec2_linux_complete_test_matrix.json))"
echo "::set-output name=ec2_performance_matrix::$(echo $(cat integration/generator/resources/ec2_performance_complete_test_matrix.json))"
echo "::set-output name=ec2_windows_matrix::$(echo $(cat integration/generator/resources/ec2_windows_complete_test_matrix.json))"
echo "::set-output name=ecs_fargate_matrix::$(echo $(cat integration/generator/resources/ecs_fargate_complete_test_matrix.json))"
- name: Echo test plan matrix
run: |
echo ${{ steps.set-matrix.outputs.ec2_linux_matrix }}
echo ${{ steps.set-matrix.outputs.ec2_performance_matrix}}
echo ${{ steps.set-matrix.outputs.ec2_windows_matrix }}
echo ${{ steps.set-matrix.outputs.ecs_fargate_matrix }}
echo "ec2_gpu_matrix: ${{ steps.set-matrix.outputs.ec2_gpu_matrix }}"
echo "ec2_linux_matrix: ${{ steps.set-matrix.outputs.ec2_linux_matrix }}"
echo "ec2_performance_matrix: ${{ steps.set-matrix.outputs.ec2_performance_matrix}}"
echo "ec2_windows_matrix: ${{ steps.set-matrix.outputs.ec2_windows_matrix }}"
echo "ecs_fargate_matrix${{ steps.set-matrix.outputs.ecs_fargate_matrix }}"
MakeMSIZip:
name: 'MakeMSIZip'
Expand Down Expand Up @@ -382,6 +386,105 @@ jobs:
echo "::set-output name=local_stack_host_name::$LOCAL_STACK_HOST_NAME" &&
aws s3 cp terraform.tfstate s3://${S3_INTEGRATION_BUCKET}/integration-test/local-stack-terraform-state/${GITHUB_SHA}/terraform.tfstate
# Runs the NVIDIA GPU integration tests: one matrix entry per element of the
# generated ec2_gpu_matrix. Each entry provisions an EC2 instance with
# Terraform (Linux or Windows module, chosen by matrix.arrays.family), lets the
# apply run the tests, and always destroys the infrastructure afterwards.
EC2NvidiaGPUIntegrationTest:
  needs: [ MakeBinary, BuildMSI, StartLocalStack, GenerateTestMatrix ]
  name: 'EC2NVIDIAGPUIntegrationTest'
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      arrays: ${{ fromJson(needs.GenerateTestMatrix.outputs.ec2_gpu_matrix) }}
  permissions:
    id-token: write
    contents: read
  steps:
    - uses: actions/checkout@v2

    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v1
      with:
        role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
        aws-region: us-west-2

    # The id must match the `steps.ec2-nvidia-integration-test` references in
    # the `if:` conditions below; otherwise cache-hit is always empty and the
    # steps can never be skipped on a cache hit.
    - name: Cache if success
      id: ec2-nvidia-integration-test
      uses: actions/cache@v2
      with:
        path: go.mod
        key: ec2-nvidia-integration-test-${{ github.sha }}-${{ matrix.arrays.os }}-${{ matrix.arrays.arc }}-${{ matrix.arrays.test_dir }}

    - name: Echo Test Info
      run: echo run on ec2 instance os ${{ matrix.arrays.os }} arc ${{ matrix.arrays.arc }} test dir ${{ matrix.arrays.test_dir }}

    - name: Verify Terraform version
      run: terraform --version

    # nick-invision/retry@v2 starts at base dir
    - name: Terraform apply
      if: ${{ matrix.arrays.family == 'linux' && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }}
      uses: nick-invision/retry@v2
      with:
        max_attempts: 3
        timeout_minutes: 30
        retry_wait_seconds: 5
        # Destroy after apply in both outcomes; a failed apply still fails the attempt.
        command: |
          cd integration/terraform/ec2/linux
          terraform init
          if terraform apply --auto-approve \
            -var="ssh_key_value=${PRIVATE_KEY}" -var="github_repo=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" \
            -var="github_sha=${GITHUB_SHA}" -var="install_agent=${{ matrix.arrays.installAgentCommand }}" \
            -var="ec2_instance_type=${{ matrix.arrays.instanceType }}" \
            -var="user=${{ matrix.arrays.username }}" \
            -var="ami=${{ matrix.arrays.ami }}" \
            -var="ca_cert_path=${{ matrix.arrays.caCertPath }}" \
            -var="arc=${{ matrix.arrays.arc }}" \
            -var="binary_name=${{ matrix.arrays.binaryName }}" \
            -var="local_stack_host_name=${{ needs.StartLocalStack.outputs.local_stack_host_name }}" \
            -var="s3_bucket=${S3_INTEGRATION_BUCKET}" \
            -var="ssh_key_name=${KEY_NAME}" \
            -var="test_name=cw-integ-test-${{ matrix.arrays.os }}" \
            -var="test_dir=${{ matrix.arrays.test_dir }}" ; then terraform destroy -auto-approve
          else
            terraform destroy -auto-approve && exit 1
          fi

    - name: Terraform apply
      if: ${{ matrix.arrays.family == 'window' && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }}
      uses: nick-invision/retry@v2
      with:
        max_attempts: 3
        timeout_minutes: 30
        retry_wait_seconds: 5
        # Destroy after apply in both outcomes; a failed apply still fails the attempt.
        command: |
          cd integration/terraform/ec2/win
          terraform init
          if terraform apply --auto-approve \
            -var="ssh_key_value=${PRIVATE_KEY}" -var="ssh_key_name=${KEY_NAME}" \
            -var="github_repo=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" \
            -var="github_sha=${GITHUB_SHA}" -var="ami=${{ matrix.arrays.ami }}" \
            -var="test_dir=${{ matrix.arrays.test_dir }}" \
            -var="ec2_instance_type=${{ matrix.arrays.instanceType }}" \
            -var="s3_bucket=${S3_INTEGRATION_BUCKET}" ; then terraform destroy -auto-approve
          else
            terraform destroy -auto-approve && exit 1
          fi

    #This is here just in case workflow cancel
    - name: Terraform destroy
      if: ${{ cancelled() && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }}
      uses: nick-invision/retry@v2
      with:
        max_attempts: 3
        timeout_minutes: 8
        retry_wait_seconds: 5
        # Pick the Terraform module with the same `family` field that gates the
        # apply steps above, using a well-formed POSIX test.
        command: |
          if [ "${{ matrix.arrays.family }}" = "window" ]; then
            cd integration/terraform/ec2/win
          else
            cd integration/terraform/ec2/linux
          fi
          terraform destroy --auto-approve
EC2LinuxIntegrationTest:
needs: [MakeBinary, StartLocalStack, GenerateTestMatrix]
name: 'EC2LinuxIntegrationTest'
Expand Down Expand Up @@ -502,7 +605,7 @@ jobs:
-var="ssh_key_value=${PRIVATE_KEY}" -var="ssh_key_name=${KEY_NAME}" \
-var="github_repo=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" \
-var="github_sha=${GITHUB_SHA}" -var="ami=${{ matrix.arrays.ami }}" \
-var="test_name=${{ matrix.arrays.os }}" \
-var="test_dir=${{ matrix.arrays.test_dir }}" \
-var="s3_bucket=${S3_INTEGRATION_BUCKET}" ; then
terraform destroy -auto-approve
else
Expand All @@ -517,7 +620,7 @@ jobs:
max_attempts: 3
timeout_minutes: 8
retry_wait_seconds: 5
command: cd cd integration/terraform/ec2/win && terraform destroy --auto-approve -var="ami=${{ matrix.arrays.ami }}"
command: cd integration/terraform/ec2/win && terraform destroy --auto-approve -var="ami=${{ matrix.arrays.ami }}"


StopLocalStack:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -237,4 +237,4 @@ dockerized-build:

# Use vendor instead of proxy when building w/ vendor folder
dockerized-build-vendor:
$(DOCKER_BUILD_FROM_SOURCE) --build-arg GO111MODULE=off .
$(DOCKER_BUILD_FROM_SOURCE) --build-arg GO111MODULE=off .
137 changes: 86 additions & 51 deletions cmd/amazon-cloudwatch-agent/amazon-cloudwatch-agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ import (
"github.com/aws/amazon-cloudwatch-agent/cfg/migrate"
"github.com/aws/amazon-cloudwatch-agent/logs"
"github.com/aws/amazon-cloudwatch-agent/profiler"

"github.com/aws/amazon-cloudwatch-agent/cmd/amazon-cloudwatch-agent/internal"
_ "github.com/aws/amazon-cloudwatch-agent/plugins"

"github.com/influxdata/telegraf/agent"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/logger"

//_ "github.com/influxdata/telegraf/plugins/aggregators/all"
"github.com/influxdata/telegraf/plugins/inputs"
//_ "github.com/influxdata/telegraf/plugins/inputs/all"
Expand Down Expand Up @@ -236,58 +236,16 @@ func runAgent(ctx context.Context,
c.OutputFilters = outputFilters
c.InputFilters = inputFilters

isOld, err := migrate.IsOldConfig(*fConfig)
if err != nil {
log.Printf("W! Failed to detect if config file is old format: %v", err)
}

if isOld {
migratedConfFile, err := migrate.MigrateFile(*fConfig)
if err != nil {
log.Printf("W! Failed to migrate old config format file %v: %v", *fConfig, err)
}

err = c.LoadConfig(migratedConfFile)
if err != nil {
return err
}

agentinfo.BuildStr += "_M"
} else {
err = c.LoadConfig(*fConfig)
if err != nil {
return err
}
}

if *fConfigDirectory != "" {
err = c.LoadDirectory(*fConfigDirectory)
if err != nil {
return err
}
}
if !*fTest && len(c.Outputs) == 0 {
return errors.New("Error: no outputs found, did you provide a valid config file?")
}
if len(c.Inputs) == 0 {
return errors.New("Error: no inputs found, did you provide a valid config file?")
}
err = loadTomlConfigIntoAgent(c)

if int64(c.Agent.Interval) <= 0 {
return fmt.Errorf("Agent interval must be positive, found %v",
c.Agent.Interval)
if err != nil {
return err
}

if int64(c.Agent.FlushInterval) <= 0 {
return fmt.Errorf("Agent flush_interval must be positive; found %v",
c.Agent.FlushInterval)
}
err = validateAgentFinalConfigAndPlugins(c)

if *fSchemaTest {
//up to this point, the given config file must be valid
fmt.Println(agentinfo.FullVersion())
fmt.Printf("The given config: %v is valid\n", *fConfig)
os.Exit(0)
if err != nil {
return err
}

ag, err := agent.NewAgent(c)
Expand Down Expand Up @@ -507,7 +465,7 @@ func main() {
}
return
}

if runtime.GOOS == "windows" && windowsRunAsService() {
programFiles := os.Getenv("ProgramFiles")
if programFiles == "" { // Should never happen
Expand Down Expand Up @@ -584,3 +542,80 @@ func windowsRunAsService() bool {

return !service.Interactive()
}

// loadTomlConfigIntoAgent loads the TOML file named by the fConfig flag into c
// and then, when the fConfigDirectory flag is non-empty, the contents of that
// directory as well.
//
// If the file is detected as being in the old format it is first migrated and
// the migrated file is loaded instead; on a successful load of a migrated file
// "_M" is appended to agentinfo.BuildStr. Failures to detect or migrate the old
// format are only logged as warnings; errors from loading are returned.
func loadTomlConfigIntoAgent(c *config.Config) error {
	legacy, detectErr := migrate.IsOldConfig(*fConfig)
	if detectErr != nil {
		log.Printf("W! Failed to detect if config file is old format: %v", detectErr)
	}

	tomlPath := *fConfig
	if legacy {
		migrated, migrateErr := migrate.MigrateFile(*fConfig)
		if migrateErr != nil {
			// NOTE(review): loading still proceeds with whatever path MigrateFile
			// returned alongside the error — confirm that is intended.
			log.Printf("W! Failed to migrate old config format file %v: %v", *fConfig, migrateErr)
		}
		tomlPath = migrated
	}

	if loadErr := c.LoadConfig(tomlPath); loadErr != nil {
		return loadErr
	}
	if legacy {
		agentinfo.BuildStr += "_M"
	}

	if *fConfigDirectory == "" {
		return nil
	}
	if dirErr := c.LoadDirectory(*fConfigDirectory); dirErr != nil {
		return dirErr
	}
	return nil
}

// validateAgentFinalConfigAndPlugins performs the final checks on the loaded
// configuration c before the agent is created.
//
// It returns an error when no outputs are configured (unless the fTest flag is
// set), when no inputs are configured, when the agent interval or flush
// interval is not positive, or when the binary rights check for a configured
// input plugin fails. When the fSchemaTest flag is set and every check passed,
// it prints the agent version and a validity message and exits the process
// with status 0.
func validateAgentFinalConfigAndPlugins(c *config.Config) error {
	switch {
	case !*fTest && len(c.Outputs) == 0:
		return errors.New("Error: no outputs found, did you provide a valid config file?")
	case len(c.Inputs) == 0:
		return errors.New("Error: no inputs found, did you provide a valid config file?")
	case int64(c.Agent.Interval) <= 0:
		return fmt.Errorf("Agent interval must be positive, found %v", c.Agent.Interval)
	case int64(c.Agent.FlushInterval) <= 0:
		return fmt.Errorf("Agent flush_interval must be positive; found %v", c.Agent.FlushInterval)
	}

	failedPlugin, rightsErr := checkRightForBinariesFileWithInputPlugins(c.InputNames())
	if rightsErr != nil {
		return fmt.Errorf("Validate input plugin %s failed because of %v", failedPlugin, rightsErr)
	}

	if !*fSchemaTest {
		return nil
	}

	// Schema-test mode: reaching this point means the given config file is valid.
	fmt.Println(agentinfo.FullVersion())
	fmt.Printf("The given config: %v is valid\n", *fConfig)
	os.Exit(0)

	return nil
}

// checkRightForBinariesFileWithInputPlugins walks the given input plugin names
// and, for every "nvidia_smi" entry, runs internal.CheckNvidiaSMIBinaryRights.
//
// On the first failing check it returns the plugin name together with the
// error; otherwise it returns an empty name and a nil error. Plugins other
// than "nvidia_smi" are not checked.
func checkRightForBinariesFileWithInputPlugins(inputPlugins []string) (string, error) {
	for _, name := range inputPlugins {
		if name != "nvidia_smi" {
			continue
		}

		rightsErr := internal.CheckNvidiaSMIBinaryRights()
		if rightsErr != nil {
			return "nvidia_smi", rightsErr
		}
	}

	return "", nil
}
20 changes: 20 additions & 0 deletions cmd/amazon-cloudwatch-agent/internal/check_plugins_unix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

//go:build !windows
// +build !windows

package internal

import (
"github.com/aws/amazon-cloudwatch-agent/internal/util/security"
"github.com/aws/amazon-cloudwatch-agent/translator/translate/metrics/util"
)


// CheckNvidiaSMIBinaryRights runs security.CheckFileRights on the default
// nvidia-smi path for non-Windows builds (util.Default_Unix_Smi_Path) and
// returns its error, or nil when the check passes.
func CheckNvidiaSMIBinaryRights() error {
	rightsErr := security.CheckFileRights(util.Default_Unix_Smi_Path)
	if rightsErr == nil {
		return nil
	}
	return rightsErr
}
19 changes: 19 additions & 0 deletions cmd/amazon-cloudwatch-agent/internal/check_plugins_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

//go:build windows
// +build windows

package internal

import (
"github.com/aws/amazon-cloudwatch-agent/internal/util/security"
"github.com/aws/amazon-cloudwatch-agent/translator/translate/metrics/util"
)

// CheckNvidiaSMIBinaryRights runs security.CheckFileRights on the default
// nvidia-smi path for Windows builds (util.Default_Windows_Smi_Path) and
// returns its error, or nil when the check passes.
func CheckNvidiaSMIBinaryRights() error {
	rightsErr := security.CheckFileRights(util.Default_Windows_Smi_Path)
	if rightsErr == nil {
		return nil
	}
	return rightsErr
}
Loading

0 comments on commit ab4b3f0

Please sign in to comment.