Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NVIDIA GPU Integration Test and Security Checking for binaries #579

Merged
merged 48 commits into from
Sep 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
f542039
Change to ec2 linux avance and simple
aateeqi Aug 19, 2022
9666c2b
Add basic NVIDIA GPU for Linux
khanhntd Aug 26, 2022
55c181e
Add basic NVIDIA GPU for Linux
khanhntd Aug 26, 2022
4886754
Add basic check file control for windows
khanhntd Aug 27, 2022
cfdc00d
Add basic nvidia check
khanhntd Aug 28, 2022
0822f9c
Finish security check for linux and macos
khanhntd Aug 28, 2022
5b7e3a2
Finish basic nvidia_gpu for security checking
khanhntd Aug 28, 2022
cfae917
Add support for linux al2
khanhntd Aug 30, 2022
1408e45
Separeate function between windows and linux
khanhntd Aug 30, 2022
a84073b
Separeate function between windows and linux--amend
khanhntd Aug 30, 2022
417c861
Merge conflict from master
khanhntd Aug 30, 2022
7765505
Add basic windows util
khanhntd Aug 30, 2022
88f2b68
Merge conflict from upstream
khanhntd Aug 30, 2022
6995f08
Add basic function for windows--amend
khanhntd Aug 30, 2022
f7ae220
Add basic function for windows--amend
khanhntd Aug 30, 2022
42aafd0
Add basic function for windows--amend
khanhntd Aug 30, 2022
e27a144
Add basic function for windows--amend
khanhntd Aug 30, 2022
4739146
Add basic function for windows--amend
khanhntd Aug 30, 2022
66fd548
Add basic function for windows--amend
khanhntd Aug 30, 2022
7e30019
Add basic function for windows--amend
khanhntd Aug 30, 2022
9a79d75
Add basic function for windows--amend
khanhntd Aug 30, 2022
f30b922
Add basic function for windows--amend
khanhntd Aug 31, 2022
d6c689e
Add basic function for windows--amend
khanhntd Aug 31, 2022
5d3c353
Add basic function for windows--amend
khanhntd Aug 31, 2022
c0eb801
Add basic function for windows--amend
khanhntd Aug 31, 2022
a0fc79e
Add basic function for windows--amend
khanhntd Aug 31, 2022
e10899c
Add basic function for windows--amend
khanhntd Aug 31, 2022
0bb0ae4
Add basic function for windows--amend
khanhntd Aug 31, 2022
3c44ff8
Add support for linux al2
khanhntd Aug 31, 2022
1409f20
Change to user data
khanhntd Sep 1, 2022
7d2f41f
Basic OPENSSH
khanhntd Sep 2, 2022
c4901e0
Basic OPENSSH
khanhntd Sep 2, 2022
1b7112f
Return to test integration test on github
khanhntd Sep 2, 2022
8cccf7f
Return to test integration test on github--amend
khanhntd Sep 2, 2022
9903b25
Return to test integration test on github--amend
khanhntd Sep 2, 2022
750e32b
Return to test integration test on github--amend
khanhntd Sep 2, 2022
52393e0
Return to test integration test on github--amend
khanhntd Sep 2, 2022
2241545
Basic OPENSSH
khanhntd Sep 2, 2022
517832a
Basic OPENSSH
khanhntd Sep 2, 2022
16d76e4
Finish NVIDIA GPU--amend
khanhntd Sep 3, 2022
5f43e6f
Finish NVIDIA GPU--amend
khanhntd Sep 3, 2022
97d8e46
Finish NVIDIA GPU--amend
khanhntd Sep 3, 2022
0f2fe01
Finish NVIDIA GPU--amend
khanhntd Sep 3, 2022
2f5702a
Finish NVIDIA GPU--amend
khanhntd Sep 3, 2022
9ed05b3
Finish NVIDIA GPU--amend
khanhntd Sep 3, 2022
3c59d44
Finish NVIDIA GPU
khanhntd Sep 3, 2022
1500ca9
revert some gpg
khanhntd Sep 6, 2022
8af0757
Finish security checking and nvida gpu
khanhntd Sep 6, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 110 additions & 6 deletions .github/workflows/integrationTest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ jobs:
echo "${GPG_PRIVATE_KEY}" | gpg --batch --import -
for f in $(find build/bin/); do if [ ! -d $f ]; then echo "Signing file $f" && echo "${PASSPHRASE}" | gpg --detach-sign --passphrase-fd 0 --batch --default-key "${GPG_KEY_NAME}" $f ; fi ; done


- name: Upload to s3
if: steps.cached_binaries.outputs.cache-hit != 'true'
run: aws s3 cp build/bin s3://${S3_INTEGRATION_BUCKET}/integration-test/binary/${{ github.sha }} --recursive
Expand Down Expand Up @@ -118,10 +119,12 @@ jobs:
${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_INTEGRATION_TEST_REPO }}:${{ github.sha }}
platforms: linux/amd64, linux/arm64


GenerateTestMatrix:
name: 'GenerateTestMatrix'
runs-on: ubuntu-latest
outputs:
ec2_gpu_matrix: ${{ steps.set-matrix.outputs.ec2_gpu_matrix }}
ec2_linux_matrix: ${{ steps.set-matrix.outputs.ec2_linux_matrix }}
ec2_performance_matrix: ${{steps.set-matrix.outputs.ec2_performance_matrix}}
ec2_windows_matrix: ${{ steps.set-matrix.outputs.ec2_windows_matrix }}
Expand All @@ -138,17 +141,19 @@ jobs:
id: set-matrix
run: |
go run --tags=generator integration/generator/test_case_generator.go
echo "::set-output name=ec2_gpu_matrix::$(echo $(cat integration/generator/resources/ec2_gpu_complete_test_matrix.json))"
echo "::set-output name=ec2_linux_matrix::$(echo $(cat integration/generator/resources/ec2_linux_complete_test_matrix.json))"
echo "::set-output name=ec2_performance_matrix::$(echo $(cat integration/generator/resources/ec2_performance_complete_test_matrix.json))"
echo "::set-output name=ec2_windows_matrix::$(echo $(cat integration/generator/resources/ec2_windows_complete_test_matrix.json))"
echo "::set-output name=ecs_fargate_matrix::$(echo $(cat integration/generator/resources/ecs_fargate_complete_test_matrix.json))"

- name: Echo test plan matrix
run: |
echo ${{ steps.set-matrix.outputs.ec2_linux_matrix }}
echo ${{ steps.set-matrix.outputs.ec2_performance_matrix}}
echo ${{ steps.set-matrix.outputs.ec2_windows_matrix }}
echo ${{ steps.set-matrix.outputs.ecs_fargate_matrix }}
echo "ec2_gpu_matrix: ${{ steps.set-matrix.outputs.ec2_gpu_matrix }}"
echo "ec2_linux_matrix: ${{ steps.set-matrix.outputs.ec2_linux_matrix }}"
echo "ec2_performance_matrix: ${{ steps.set-matrix.outputs.ec2_performance_matrix}}"
echo "ec2_windows_matrix: ${{ steps.set-matrix.outputs.ec2_windows_matrix }}"
echo "ecs_fargate_matrix${{ steps.set-matrix.outputs.ecs_fargate_matrix }}"

MakeMSIZip:
name: 'MakeMSIZip'
Expand Down Expand Up @@ -382,6 +387,105 @@ jobs:
echo "::set-output name=local_stack_host_name::$LOCAL_STACK_HOST_NAME" &&
aws s3 cp terraform.tfstate s3://${S3_INTEGRATION_BUCKET}/integration-test/local-stack-terraform-state/${GITHUB_SHA}/terraform.tfstate

EC2NvidiaGPUIntegrationTest:
needs: [ MakeBinary, BuildMSI, StartLocalStack, GenerateTestMatrix ]
name: 'EC2NVIDIAGPUIntegrationTest'
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
arrays: ${{ fromJson(needs.GenerateTestMatrix.outputs.ec2_gpu_matrix) }}
permissions:
id-token: write
contents: read
steps:
- uses: actions/checkout@v2

- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v1
with:
role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }}
aws-region: us-west-2

# The id must match the `steps.ec2-nvidia-integration-test.outputs.cache-hit`
# lookups used by the later steps of this job; with a different id that
# lookup is always empty and the cache can never short-circuit a re-run.
- name: Cache if success
  id: ec2-nvidia-integration-test
  uses: actions/cache@v2
  with:
    path: go.mod
    key: ec2-nvidia-integration-test-${{ github.sha }}-${{ matrix.arrays.os }}-${{ matrix.arrays.arc }}-${{ matrix.arrays.test_dir }}

- name: Echo Test Info
run: echo run on ec2 instance os ${{ matrix.arrays.os }} arc ${{ matrix.arrays.arc }} test dir ${{ matrix.arrays.test_dir }}

- name: Verify Terraform version
run: terraform --version

# nick-invision/retry@v2 starts at base dir
- name: Terraform apply
if: ${{ matrix.arrays.family == 'linux' && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }}
uses: nick-invision/retry@v2
with:
max_attempts: 3
timeout_minutes: 30
retry_wait_seconds: 5
command: |
cd integration/terraform/ec2/linux
terraform init
if terraform apply --auto-approve \
-var="ssh_key_value=${PRIVATE_KEY}" -var="github_repo=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" \
-var="github_sha=${GITHUB_SHA}" -var="install_agent=${{ matrix.arrays.installAgentCommand }}" \
-var="ec2_instance_type=${{ matrix.arrays.instanceType }}" \
-var="user=${{ matrix.arrays.username }}" \
-var="ami=${{ matrix.arrays.ami }}" \
-var="ca_cert_path=${{ matrix.arrays.caCertPath }}" \
-var="arc=${{ matrix.arrays.arc }}" \
-var="binary_name=${{ matrix.arrays.binaryName }}" \
-var="local_stack_host_name=${{ needs.StartLocalStack.outputs.local_stack_host_name }}" \
-var="s3_bucket=${S3_INTEGRATION_BUCKET}" \
-var="ssh_key_name=${KEY_NAME}" \
-var="test_name=cw-integ-test-${{ matrix.arrays.os }}" \
-var="test_dir=${{ matrix.arrays.test_dir }}" ; then terraform destroy -auto-approve
else
terraform destroy -auto-approve && exit 1
fi

- name: Terraform apply
if: ${{ matrix.arrays.family == 'window' && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }}
uses: nick-invision/retry@v2
with:
max_attempts: 3
timeout_minutes: 30
retry_wait_seconds: 5
command: |
cd integration/terraform/ec2/win
terraform init
if terraform apply --auto-approve \
-var="ssh_key_value=${PRIVATE_KEY}" -var="ssh_key_name=${KEY_NAME}" \
-var="github_repo=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" \
-var="github_sha=${GITHUB_SHA}" -var="ami=${{ matrix.arrays.ami }}" \
-var="test_dir=${{ matrix.arrays.test_dir }}" \
-var="ec2_instance_type=${{ matrix.arrays.instanceType }}" \
-var="s3_bucket=${S3_INTEGRATION_BUCKET}" ; then terraform destroy -auto-approve
else
terraform destroy -auto-approve && exit 1
fi

# Safety net: tear the infrastructure down if the workflow is cancelled
# before one of the apply steps reached its own `terraform destroy`.
- name: Terraform destroy
  if: ${{ cancelled() && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }}
  uses: nick-invision/retry@v2
  with:
    max_attempts: 3
    timeout_minutes: 8
    retry_wait_seconds: 5
    command: |
      # Select the Terraform directory with the same `family` value the
      # apply steps key on. A shell `if` needs a test command and `then`;
      # without them the script is a syntax error and nothing is destroyed.
      if [ "${{ matrix.arrays.family }}" = "window" ]; then
        cd integration/terraform/ec2/win
      else
        cd integration/terraform/ec2/linux
      fi
      terraform destroy --auto-approve

EC2LinuxIntegrationTest:
needs: [MakeBinary, StartLocalStack, GenerateTestMatrix]
name: 'EC2LinuxIntegrationTest'
Expand Down Expand Up @@ -502,7 +606,7 @@ jobs:
-var="ssh_key_value=${PRIVATE_KEY}" -var="ssh_key_name=${KEY_NAME}" \
-var="github_repo=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" \
-var="github_sha=${GITHUB_SHA}" -var="ami=${{ matrix.arrays.ami }}" \
-var="test_name=${{ matrix.arrays.os }}" \
-var="test_dir=${{ matrix.arrays.test_dir }}" \
-var="s3_bucket=${S3_INTEGRATION_BUCKET}" ; then
terraform destroy -auto-approve
else
Expand All @@ -517,7 +621,7 @@ jobs:
max_attempts: 3
timeout_minutes: 8
retry_wait_seconds: 5
command: cd cd integration/terraform/ec2/win && terraform destroy --auto-approve -var="ami=${{ matrix.arrays.ami }}"
command: cd integration/terraform/ec2/win && terraform destroy --auto-approve -var="ami=${{ matrix.arrays.ami }}"


StopLocalStack:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -288,4 +288,4 @@ dockerized-build:

# Use vendor instead of proxy when building w/ vendor folder
dockerized-build-vendor:
$(DOCKER_BUILD_FROM_SOURCE) --build-arg GO111MODULE=off .
$(DOCKER_BUILD_FROM_SOURCE) --build-arg GO111MODULE=off .
137 changes: 86 additions & 51 deletions cmd/amazon-cloudwatch-agent/amazon-cloudwatch-agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ import (
"github.com/aws/amazon-cloudwatch-agent/cfg/migrate"
"github.com/aws/amazon-cloudwatch-agent/logs"
"github.com/aws/amazon-cloudwatch-agent/profiler"

"github.com/aws/amazon-cloudwatch-agent/cmd/amazon-cloudwatch-agent/internal"
_ "github.com/aws/amazon-cloudwatch-agent/plugins"

"github.com/influxdata/telegraf/agent"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/logger"

//_ "github.com/influxdata/telegraf/plugins/aggregators/all"
"github.com/influxdata/telegraf/plugins/inputs"
//_ "github.com/influxdata/telegraf/plugins/inputs/all"
Expand Down Expand Up @@ -236,58 +236,16 @@ func runAgent(ctx context.Context,
c.OutputFilters = outputFilters
c.InputFilters = inputFilters

isOld, err := migrate.IsOldConfig(*fConfig)
if err != nil {
log.Printf("W! Failed to detect if config file is old format: %v", err)
}

if isOld {
migratedConfFile, err := migrate.MigrateFile(*fConfig)
if err != nil {
log.Printf("W! Failed to migrate old config format file %v: %v", *fConfig, err)
}

err = c.LoadConfig(migratedConfFile)
if err != nil {
return err
}

agentinfo.BuildStr += "_M"
} else {
err = c.LoadConfig(*fConfig)
if err != nil {
return err
}
}

if *fConfigDirectory != "" {
err = c.LoadDirectory(*fConfigDirectory)
if err != nil {
return err
}
}
if !*fTest && len(c.Outputs) == 0 {
return errors.New("Error: no outputs found, did you provide a valid config file?")
}
if len(c.Inputs) == 0 {
return errors.New("Error: no inputs found, did you provide a valid config file?")
}
err = loadTomlConfigIntoAgent(c)

if int64(c.Agent.Interval) <= 0 {
return fmt.Errorf("Agent interval must be positive, found %v",
c.Agent.Interval)
if err != nil {
return err
}

if int64(c.Agent.FlushInterval) <= 0 {
return fmt.Errorf("Agent flush_interval must be positive; found %v",
c.Agent.FlushInterval)
}
err = validateAgentFinalConfigAndPlugins(c)

if *fSchemaTest {
//up to this point, the given config file must be valid
fmt.Println(agentinfo.FullVersion())
fmt.Printf("The given config: %v is valid\n", *fConfig)
os.Exit(0)
if err != nil {
return err
}

ag, err := agent.NewAgent(c)
Expand Down Expand Up @@ -507,7 +465,7 @@ func main() {
}
return
}

if runtime.GOOS == "windows" && windowsRunAsService() {
programFiles := os.Getenv("ProgramFiles")
if programFiles == "" { // Should never happen
Expand Down Expand Up @@ -584,3 +542,80 @@ func windowsRunAsService() bool {

return !service.Interactive()
}

// loadTomlConfigIntoAgent loads the configuration file named by the fConfig
// flag into c and, when the fConfigDirectory flag is non-empty, also loads
// that directory via c.LoadDirectory.
//
// If migrate.IsOldConfig reports the old format, the file is first passed
// through migrate.MigrateFile and the migrated file is loaded instead; after
// a successful load "_M" is appended to agentinfo.BuildStr. Failures to detect
// or migrate the old format are only logged as warnings, whereas errors from
// c.LoadConfig and c.LoadDirectory are returned to the caller.
func loadTomlConfigIntoAgent(c *config.Config) error {
	isOld, detectErr := migrate.IsOldConfig(*fConfig)
	if detectErr != nil {
		log.Printf("W! Failed to detect if config file is old format: %v", detectErr)
	}

	// Decide which file to hand to LoadConfig: the original one, or the
	// output of the migration when the original uses the old format.
	configPath := *fConfig
	if isOld {
		migratedConfFile, migrateErr := migrate.MigrateFile(*fConfig)
		if migrateErr != nil {
			// Warn only; the load below is still attempted with whatever
			// path MigrateFile returned.
			log.Printf("W! Failed to migrate old config format file %v: %v", *fConfig, migrateErr)
		}
		configPath = migratedConfFile
	}

	if loadErr := c.LoadConfig(configPath); loadErr != nil {
		return loadErr
	}
	if isOld {
		// Tag the build string once a migrated config has been loaded.
		agentinfo.BuildStr += "_M"
	}

	if *fConfigDirectory == "" {
		return nil
	}
	return c.LoadDirectory(*fConfigDirectory)
}

// validateAgentFinalConfigAndPlugins performs the final sanity checks on the
// loaded configuration c and returns the first failure found:
//
//   - at least one output must be configured (skipped when the fTest flag is set);
//   - at least one input must be configured;
//   - the agent interval and flush interval must both be positive;
//   - checkRightForBinariesFileWithInputPlugins must accept the configured inputs.
//
// When every check passes and the fSchemaTest flag is set, the function prints
// the agent version and a confirmation message and then terminates the process
// with exit status 0 instead of returning.
func validateAgentFinalConfigAndPlugins(c *config.Config) error {
	// Cases are evaluated top to bottom, so the first violated rule wins.
	switch {
	case !*fTest && len(c.Outputs) == 0:
		return errors.New("Error: no outputs found, did you provide a valid config file?")
	case len(c.Inputs) == 0:
		return errors.New("Error: no inputs found, did you provide a valid config file?")
	case int64(c.Agent.Interval) <= 0:
		return fmt.Errorf("Agent interval must be positive, found %v", c.Agent.Interval)
	case int64(c.Agent.FlushInterval) <= 0:
		return fmt.Errorf("Agent flush_interval must be positive; found %v", c.Agent.FlushInterval)
	}

	failedPlugin, checkErr := checkRightForBinariesFileWithInputPlugins(c.InputNames())
	if checkErr != nil {
		return fmt.Errorf("Validate input plugin %s failed because of %v", failedPlugin, checkErr)
	}

	if !*fSchemaTest {
		return nil
	}

	// Schema-test mode: reaching this point means the given config is valid,
	// so report that and stop the process here.
	fmt.Println(agentinfo.FullVersion())
	fmt.Printf("The given config: %v is valid\n", *fConfig)
	os.Exit(0)

	return nil
}

// checkRightForBinariesFileWithInputPlugins walks the given input plugin names
// and, for every entry equal to "nvidia_smi", runs
// internal.CheckNvidiaSMIBinaryRights. On the first failing check it returns
// the plugin name together with the error; otherwise it returns an empty name
// and a nil error. Plugin names other than "nvidia_smi" are ignored.
func checkRightForBinariesFileWithInputPlugins(inputPlugins []string) (string, error) {
	const nvidiaSMI = "nvidia_smi"

	for i := range inputPlugins {
		if inputPlugins[i] != nvidiaSMI {
			continue
		}
		if err := internal.CheckNvidiaSMIBinaryRights(); err != nil {
			return nvidiaSMI, err
		}
	}

	return "", nil
}
20 changes: 20 additions & 0 deletions cmd/amazon-cloudwatch-agent/internal/check_plugins_unix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

//go:build !windows
// +build !windows

package internal

import (
"github.com/aws/amazon-cloudwatch-agent/internal/util/security"
"github.com/aws/amazon-cloudwatch-agent/translator/translate/metrics/util"
)


// CheckNvidiaSMIBinaryRights runs security.CheckFileRights on the path held in
// util.Default_Unix_Smi_Path and returns its result unchanged: nil when the
// check passes, the reported error otherwise.
func CheckNvidiaSMIBinaryRights() error {
	return security.CheckFileRights(util.Default_Unix_Smi_Path)
}
19 changes: 19 additions & 0 deletions cmd/amazon-cloudwatch-agent/internal/check_plugins_windows.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT

//go:build windows
// +build windows

package internal

import (
"github.com/aws/amazon-cloudwatch-agent/internal/util/security"
"github.com/aws/amazon-cloudwatch-agent/translator/translate/metrics/util"
)

// CheckNvidiaSMIBinaryRights runs security.CheckFileRights on the path held in
// util.Default_Windows_Smi_Path and returns its result unchanged: nil when the
// check passes, the reported error otherwise.
func CheckNvidiaSMIBinaryRights() error {
	return security.CheckFileRights(util.Default_Windows_Smi_Path)
}
Loading