From ab4b3f027663a10dea1435a24995e225f0816b4e Mon Sep 17 00:00:00 2001 From: Khanh Nguyen <91758108+khanhntd@users.noreply.github.com> Date: Thu, 8 Sep 2022 16:45:07 -0400 Subject: [PATCH] Add NVIDIA GPU Integration Test and Security Checking for binaries (#579) * Change to ec2 linux avance and simple * Add basic NVIDIA GPU for Linux * Add basic check file control for windows * Add basic nvidia check * Finish security check for linux and macos * Finish basic nvidia_gpu for security checking * Add support for linux al2 * Separeate function between windows and linux * Separeate function between windows and linux--amend * Add basic windows util * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add basic function for windows--amend * Add support for linux al2 * Change to user data * Basic OPENSSH * Basic OPENSSH * Return to test integration test on github * Return to test integration test on github--amend * Return to test integration test on github--amend * Return to test integration test on github--amend * Return to test integration test on github--amend * Basic OPENSSH * Basic OPENSSH * Finish NVIDIA GPU--amend * Finish NVIDIA GPU--amend * Finish NVIDIA GPU--amend * Finish NVIDIA GPU--amend * Finish NVIDIA GPU--amend * Finish NVIDIA GPU--amend * Finish NVIDIA GPU * revert some gpg * Finish security checking and nvida gpu Co-authored-by: Ameen --- .github/workflows/integrationTest.yml | 115 ++++++++++++++- Makefile | 2 +- .../amazon-cloudwatch-agent.go | 137 +++++++++++------- .../internal/check_plugins_unix.go | 20 +++ .../internal/check_plugins_windows.go | 19 +++ .../resources/ec2_gpu_test_matrix.json | 19 +++ .../resources/ec2_windows_test_matrix.json | 5 +- integration/generator/test_case_generator.go | 3 + integration/terraform/ec2/linux/variables.tf | 8 +- integration/terraform/ec2/linux/vpc.tf | 3 +- .../terraform/ec2/localstack/providers.tf | 2 +- .../terraform/ec2/localstack/variables.tf | 14 +- integration/terraform/ec2/localstack/vpc.tf | 3 +- integration/terraform/ec2/win/main.tf | 38 ++++- integration/terraform/ec2/win/variables.tf | 12 +- integration/terraform/ec2/win/vpc.tf | 9 +- integration/terraform/ecs/linux/README.md | 8 +- integration/terraform/ecs/linux/vpc.tf | 3 +- .../{agent_util.go => agent_util_linux.go} | 54 +------ integration/test/agent_util_windows.go | 101 +++++++++++++ integration/test/cwm_util.go | 100 +++++++++++++ .../metrics_number_dimension_test.go | 25 +--- .../metrics_nvidia_gpu_linux_test.go | 53 +++++++ .../metrics_nvidia_gpu_window_test.go | 60 ++++++++ .../nvidia_gpu/resources/config_linux.json | 30 ++++ .../nvidia_gpu/resources/config_windows.json | 25 ++++ .../test/performancetest/transmitter.go | 1 + integration/test/sanity/sanity_windows.go | 2 +- integration/test/util.go | 31 ++++ internal/util/security/unix_permission.go | 51 +++++++ internal/util/security/windows_permission.go | 109 ++++++++++++++ internal/util/security/windows_sec.go | 109 ++++++++++++++ .../metrics/util/commonconfigutil.go | 2 +- .../translate/metrics/util/measurementutil.go | 46 +++--- 34 files changed, 1028 insertions(+), 191 deletions(-) create mode 100644 cmd/amazon-cloudwatch-agent/internal/check_plugins_unix.go create mode 100644 cmd/amazon-cloudwatch-agent/internal/check_plugins_windows.go create mode 100644 integration/generator/resources/ec2_gpu_test_matrix.json rename integration/test/{agent_util.go => agent_util_linux.go} (64%) create mode 100644 integration/test/agent_util_windows.go create mode 100644 integration/test/cwm_util.go create mode 100644 integration/test/nvidia_gpu/metrics_nvidia_gpu_linux_test.go create mode 100644 integration/test/nvidia_gpu/metrics_nvidia_gpu_window_test.go create mode 100644 integration/test/nvidia_gpu/resources/config_linux.json create mode 100644 integration/test/nvidia_gpu/resources/config_windows.json create mode 100644 integration/test/util.go create mode 100644 internal/util/security/unix_permission.go create mode 100644 internal/util/security/windows_permission.go create mode 100644 internal/util/security/windows_sec.go diff --git a/.github/workflows/integrationTest.yml b/.github/workflows/integrationTest.yml index bfc147e4b7..34f5ce12bc 100644 --- a/.github/workflows/integrationTest.yml +++ b/.github/workflows/integrationTest.yml @@ -118,10 +118,12 @@ jobs: ${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_INTEGRATION_TEST_REPO }}:${{ github.sha }} platforms: linux/amd64, linux/arm64 + GenerateTestMatrix: name: 'GenerateTestMatrix' runs-on: ubuntu-latest outputs: + ec2_gpu_matrix: ${{ steps.set-matrix.outputs.ec2_gpu_matrix }} ec2_linux_matrix: ${{ steps.set-matrix.outputs.ec2_linux_matrix }} ec2_performance_matrix: ${{steps.set-matrix.outputs.ec2_performance_matrix}} ec2_windows_matrix: ${{ steps.set-matrix.outputs.ec2_windows_matrix }} @@ -138,6 +140,7 @@ jobs: id: set-matrix run: | go run --tags=generator integration/generator/test_case_generator.go + echo "::set-output name=ec2_gpu_matrix::$(echo $(cat integration/generator/resources/ec2_gpu_complete_test_matrix.json))" echo "::set-output name=ec2_linux_matrix::$(echo $(cat integration/generator/resources/ec2_linux_complete_test_matrix.json))" echo "::set-output name=ec2_performance_matrix::$(echo $(cat integration/generator/resources/ec2_performance_complete_test_matrix.json))" echo "::set-output name=ec2_windows_matrix::$(echo $(cat integration/generator/resources/ec2_windows_complete_test_matrix.json))" @@ -145,10 +148,11 @@ jobs: - name: Echo test plan matrix run: | - echo ${{ steps.set-matrix.outputs.ec2_linux_matrix }} - echo ${{ steps.set-matrix.outputs.ec2_performance_matrix}} - echo ${{ steps.set-matrix.outputs.ec2_windows_matrix }} - echo ${{ steps.set-matrix.outputs.ecs_fargate_matrix }} + echo "ec2_gpu_matrix: ${{ steps.set-matrix.outputs.ec2_gpu_matrix }}" + echo "ec2_linux_matrix: ${{ steps.set-matrix.outputs.ec2_linux_matrix }}" + echo "ec2_performance_matrix: ${{ steps.set-matrix.outputs.ec2_performance_matrix}}" + echo "ec2_windows_matrix: ${{ steps.set-matrix.outputs.ec2_windows_matrix }}" + echo "ecs_fargate_matrix${{ steps.set-matrix.outputs.ecs_fargate_matrix }}" MakeMSIZip: name: 'MakeMSIZip' @@ -382,6 +386,105 @@ jobs: echo "::set-output name=local_stack_host_name::$LOCAL_STACK_HOST_NAME" && aws s3 cp terraform.tfstate s3://${S3_INTEGRATION_BUCKET}/integration-test/local-stack-terraform-state/${GITHUB_SHA}/terraform.tfstate + EC2NvidiaGPUIntegrationTest: + needs: [ MakeBinary, BuildMSI, StartLocalStack, GenerateTestMatrix ] + name: 'EC2NVIDIAGPUIntegrationTest' + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + arrays: ${{ fromJson(needs.GenerateTestMatrix.outputs.ec2_gpu_matrix) }} + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v2 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + role-to-assume: ${{ env.TERRAFORM_AWS_ASSUME_ROLE }} + aws-region: us-west-2 + + - name: Cache if success + id: ec2-linux-integration-test + uses: actions/cache@v2 + with: + path: go.mod + key: ec2-nvidia-integration-test-${{ github.sha }}-${{ matrix.arrays.os }}-${{ matrix.arrays.arc }}-${{ matrix.arrays.test_dir }} + + - name: Echo Test Info + run: echo run on ec2 instance os ${{ matrix.arrays.os }} arc ${{ matrix.arrays.arc }} test dir ${{ matrix.arrays.test_dir }} + + - name: Verify Terraform version + run: terraform --version + + # nick-invision/retry@v2 starts at base dir + - name: Terraform apply + if: ${{ matrix.arrays.family == 'linux' && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }} + uses: nick-invision/retry@v2 + with: + max_attempts: 3 + timeout_minutes: 30 + retry_wait_seconds: 5 + command: | + cd integration/terraform/ec2/linux + terraform init + if terraform apply --auto-approve \ + -var="ssh_key_value=${PRIVATE_KEY}" -var="github_repo=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" \ + -var="github_sha=${GITHUB_SHA}" -var="install_agent=${{ matrix.arrays.installAgentCommand }}" \ + -var="ec2_instance_type=${{ matrix.arrays.instanceType }}" \ + -var="user=${{ matrix.arrays.username }}" \ + -var="ami=${{ matrix.arrays.ami }}" \ + -var="ca_cert_path=${{ matrix.arrays.caCertPath }}" \ + -var="arc=${{ matrix.arrays.arc }}" \ + -var="binary_name=${{ matrix.arrays.binaryName }}" \ + -var="local_stack_host_name=${{ needs.StartLocalStack.outputs.local_stack_host_name }}" \ + -var="s3_bucket=${S3_INTEGRATION_BUCKET}" \ + -var="ssh_key_name=${KEY_NAME}" \ + -var="test_name=cw-integ-test-${{ matrix.arrays.os }}" \ + -var="test_dir=${{ matrix.arrays.test_dir }}" ; then terraform destroy -auto-approve + else + terraform destroy -auto-approve && exit 1 + fi + + - name: Terraform apply + if: ${{ matrix.arrays.family == 'window' && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }} + uses: nick-invision/retry@v2 + with: + max_attempts: 3 + timeout_minutes: 30 + retry_wait_seconds: 5 + command: | + cd integration/terraform/ec2/win + terraform init + if terraform apply --auto-approve \ + -var="ssh_key_value=${PRIVATE_KEY}" -var="ssh_key_name=${KEY_NAME}" \ + -var="github_repo=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" \ + -var="github_sha=${GITHUB_SHA}" -var="ami=${{ matrix.arrays.ami }}" \ + -var="test_dir=${{ matrix.arrays.test_dir }}" \ + -var="ec2_instance_type=${{ matrix.arrays.instanceType }}" \ + -var="s3_bucket=${S3_INTEGRATION_BUCKET}" ; then terraform destroy -auto-approve + else + terraform destroy -auto-approve && exit 1 + fi + + #This is here just in case workflow cancel + - name: Terraform destroy + if: ${{ cancelled() && steps.ec2-nvidia-integration-test.outputs.cache-hit != 'true' }} + uses: nick-invision/retry@v2 + with: + max_attempts: 3 + timeout_minutes: 8 + retry_wait_seconds: 5 + command: | + if "${{ matrix.arrays.os }}" == window + cd integration/terraform/ec2/win + else + cd integration/terraform/ec2/linux + fi + terraform destroy --auto-approve + EC2LinuxIntegrationTest: needs: [MakeBinary, StartLocalStack, GenerateTestMatrix] name: 'EC2LinuxIntegrationTest' @@ -502,7 +605,7 @@ jobs: -var="ssh_key_value=${PRIVATE_KEY}" -var="ssh_key_name=${KEY_NAME}" \ -var="github_repo=${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}.git" \ -var="github_sha=${GITHUB_SHA}" -var="ami=${{ matrix.arrays.ami }}" \ - -var="test_name=${{ matrix.arrays.os }}" \ + -var="test_dir=${{ matrix.arrays.test_dir }}" \ -var="s3_bucket=${S3_INTEGRATION_BUCKET}" ; then terraform destroy -auto-approve else @@ -517,7 +620,7 @@ jobs: max_attempts: 3 timeout_minutes: 8 retry_wait_seconds: 5 - command: cd cd integration/terraform/ec2/win && terraform destroy --auto-approve -var="ami=${{ matrix.arrays.ami }}" + command: cd integration/terraform/ec2/win && terraform destroy --auto-approve -var="ami=${{ matrix.arrays.ami }}" StopLocalStack: diff --git a/Makefile b/Makefile index ffc6fd727d..c680fa851e 100644 --- a/Makefile +++ b/Makefile @@ -237,4 +237,4 @@ dockerized-build: # Use vendor instead of proxy when building w/ vendor folder dockerized-build-vendor: - $(DOCKER_BUILD_FROM_SOURCE) --build-arg GO111MODULE=off . + $(DOCKER_BUILD_FROM_SOURCE) --build-arg GO111MODULE=off . \ No newline at end of file diff --git a/cmd/amazon-cloudwatch-agent/amazon-cloudwatch-agent.go b/cmd/amazon-cloudwatch-agent/amazon-cloudwatch-agent.go index a2b7e25812..2ee22f134c 100644 --- a/cmd/amazon-cloudwatch-agent/amazon-cloudwatch-agent.go +++ b/cmd/amazon-cloudwatch-agent/amazon-cloudwatch-agent.go @@ -30,12 +30,12 @@ import ( "github.com/aws/amazon-cloudwatch-agent/cfg/migrate" "github.com/aws/amazon-cloudwatch-agent/logs" "github.com/aws/amazon-cloudwatch-agent/profiler" - + "github.com/aws/amazon-cloudwatch-agent/cmd/amazon-cloudwatch-agent/internal" _ "github.com/aws/amazon-cloudwatch-agent/plugins" + "github.com/influxdata/telegraf/agent" "github.com/influxdata/telegraf/config" "github.com/influxdata/telegraf/logger" - //_ "github.com/influxdata/telegraf/plugins/aggregators/all" "github.com/influxdata/telegraf/plugins/inputs" //_ "github.com/influxdata/telegraf/plugins/inputs/all" @@ -236,58 +236,16 @@ func runAgent(ctx context.Context, c.OutputFilters = outputFilters c.InputFilters = inputFilters - isOld, err := migrate.IsOldConfig(*fConfig) - if err != nil { - log.Printf("W! Failed to detect if config file is old format: %v", err) - } - - if isOld { - migratedConfFile, err := migrate.MigrateFile(*fConfig) - if err != nil { - log.Printf("W! Failed to migrate old config format file %v: %v", *fConfig, err) - } - - err = c.LoadConfig(migratedConfFile) - if err != nil { - return err - } - - agentinfo.BuildStr += "_M" - } else { - err = c.LoadConfig(*fConfig) - if err != nil { - return err - } - } - - if *fConfigDirectory != "" { - err = c.LoadDirectory(*fConfigDirectory) - if err != nil { - return err - } - } - if !*fTest && len(c.Outputs) == 0 { - return errors.New("Error: no outputs found, did you provide a valid config file?") - } - if len(c.Inputs) == 0 { - return errors.New("Error: no inputs found, did you provide a valid config file?") - } + err = loadTomlConfigIntoAgent(c) - if int64(c.Agent.Interval) <= 0 { - return fmt.Errorf("Agent interval must be positive, found %v", - c.Agent.Interval) + if err != nil { + return err } - if int64(c.Agent.FlushInterval) <= 0 { - return fmt.Errorf("Agent flush_interval must be positive; found %v", - c.Agent.FlushInterval) - } + err = validateAgentFinalConfigAndPlugins(c) - if *fSchemaTest { - //up to this point, the given config file must be valid - fmt.Println(agentinfo.FullVersion()) - fmt.Printf("The given config: %v is valid\n", *fConfig) - os.Exit(0) + if err != nil { + return err } ag, err := agent.NewAgent(c) @@ -507,7 +465,7 @@ func main() { } return } - + if runtime.GOOS == "windows" && windowsRunAsService() { programFiles := os.Getenv("ProgramFiles") if programFiles == "" { // Should never happen @@ -584,3 +542,80 @@ func windowsRunAsService() bool { return !service.Interactive() } + +func loadTomlConfigIntoAgent(c *config.Config) error{ + isOld, err := migrate.IsOldConfig(*fConfig) + if err != nil { + log.Printf("W! Failed to detect if config file is old format: %v", err) + } + + if isOld { + migratedConfFile, err := migrate.MigrateFile(*fConfig) + if err != nil { + log.Printf("W! Failed to migrate old config format file %v: %v", *fConfig, err) + } + + err = c.LoadConfig(migratedConfFile) + if err != nil { + return err + } + + agentinfo.BuildStr += "_M" + } else { + err = c.LoadConfig(*fConfig) + if err != nil { + return err + } + } + + if *fConfigDirectory != "" { + err = c.LoadDirectory(*fConfigDirectory) + if err != nil { + return err + } + } + + return nil +} + +func validateAgentFinalConfigAndPlugins(c *config.Config) error{ + if !*fTest && len(c.Outputs) == 0 { + return errors.New("Error: no outputs found, did you provide a valid config file?") + } + if len(c.Inputs) == 0 { + return errors.New("Error: no inputs found, did you provide a valid config file?") + } + + if int64(c.Agent.Interval) <= 0 { + return fmt.Errorf("Agent interval must be positive, found %v", c.Agent.Interval) + } + + if int64(c.Agent.FlushInterval) <= 0 { + return fmt.Errorf("Agent flush_interval must be positive; found %v", c.Agent.FlushInterval) + } + + if inputPlugin, err := checkRightForBinariesFileWithInputPlugins(c.InputNames()); err != nil { + return fmt.Errorf("Validate input plugin %s failed because of %v", inputPlugin, err) + } + + if *fSchemaTest { + //up to this point, the given config file must be valid + fmt.Println(agentinfo.FullVersion()) + fmt.Printf("The given config: %v is valid\n", *fConfig) + os.Exit(0) + } + + return nil +} + +func checkRightForBinariesFileWithInputPlugins(inputPlugins []string) (string, error) { + for _, inputPlugin := range inputPlugins { + if inputPlugin == "nvidia_smi" { + if err := internal.CheckNvidiaSMIBinaryRights(); err != nil { + return "nvidia_smi", err + } + } + } + + return "", nil +} diff --git a/cmd/amazon-cloudwatch-agent/internal/check_plugins_unix.go b/cmd/amazon-cloudwatch-agent/internal/check_plugins_unix.go new file mode 100644 index 0000000000..49e7126ba7 --- /dev/null +++ b/cmd/amazon-cloudwatch-agent/internal/check_plugins_unix.go @@ -0,0 +1,20 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +//go:build !windows +// +build !windows + +package internal + +import ( + "github.com/aws/amazon-cloudwatch-agent/internal/util/security" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/metrics/util" +) + + +func CheckNvidiaSMIBinaryRights() error { + if err := security.CheckFileRights(util.Default_Unix_Smi_Path); err != nil{ + return err + } + return nil +} \ No newline at end of file diff --git a/cmd/amazon-cloudwatch-agent/internal/check_plugins_windows.go b/cmd/amazon-cloudwatch-agent/internal/check_plugins_windows.go new file mode 100644 index 0000000000..248555c518 --- /dev/null +++ b/cmd/amazon-cloudwatch-agent/internal/check_plugins_windows.go @@ -0,0 +1,19 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +//go:build windows +// +build windows + +package internal + +import ( + "github.com/aws/amazon-cloudwatch-agent/internal/util/security" + "github.com/aws/amazon-cloudwatch-agent/translator/translate/metrics/util" +) + +func CheckNvidiaSMIBinaryRights() error { + if err := security.CheckFileRights(util.Default_Windows_Smi_Path); err != nil{ + return err + } + return nil +} \ No newline at end of file diff --git a/integration/generator/resources/ec2_gpu_test_matrix.json b/integration/generator/resources/ec2_gpu_test_matrix.json new file mode 100644 index 0000000000..8b5e7f1424 --- /dev/null +++ b/integration/generator/resources/ec2_gpu_test_matrix.json @@ -0,0 +1,19 @@ +[ + { + "os": "al2", + "username": "ec2-user", + "instanceType":"g4dn.xlarge", + "installAgentCommand": "rpm -U ./amazon-cloudwatch-agent.rpm", + "ami": "cloudwatch-agent-integration-test-nvidia-gpu-al2*", + "caCertPath": "/etc/ssl/certs/ca-bundle.crt", + "arc": "amd64", + "binaryName": "amazon-cloudwatch-agent.rpm", + "family": "linux" + }, + { + "os": "win-2019", + "instanceType":"g4dn.xlarge", + "ami": "Windows_Server-2019-English-Deep-Learning*", + "family": "window" + } +] \ No newline at end of file diff --git a/integration/generator/resources/ec2_windows_test_matrix.json b/integration/generator/resources/ec2_windows_test_matrix.json index ae66556fee..7f7dc0d187 100644 --- a/integration/generator/resources/ec2_windows_test_matrix.json +++ b/integration/generator/resources/ec2_windows_test_matrix.json @@ -1,6 +1,7 @@ [ { - "os": "win-2022", - "ami": "cloudwatch-agent-integration-test-win-2022*" + "os": "win-2019", + "instanceType":"g4dn.xlarge", + "ami": "Windows_Server-2019-English-Deep-Learning*" } ] \ No newline at end of file diff --git a/integration/generator/test_case_generator.go b/integration/generator/test_case_generator.go index 6527e150c0..74aafde052 100644 --- a/integration/generator/test_case_generator.go +++ b/integration/generator/test_case_generator.go @@ -20,6 +20,9 @@ const ( //you can't have a const map in golang var osToTestDirMap = map[string][]string{ + "ec2_gpu": { + "./integration/test/nvidia_gpu", + }, "ec2_linux": { "./integration/test/ca_bundle", "./integration/test/cloudwatchlogs", diff --git a/integration/terraform/ec2/linux/variables.tf b/integration/terraform/ec2/linux/variables.tf index 0f3ba5ad31..bbe2daf029 100644 --- a/integration/terraform/ec2/linux/variables.tf +++ b/integration/terraform/ec2/linux/variables.tf @@ -79,11 +79,11 @@ variable "github_repo" { default = "" } -variable "github_sha_date"{ - type = string +variable "github_sha_date" { + type = string default = "" } -variable "performance_number_of_logs"{ - type = string +variable "performance_number_of_logs" { + type = string default = "" } diff --git a/integration/terraform/ec2/linux/vpc.tf b/integration/terraform/ec2/linux/vpc.tf index 6e1f7c58e5..3f41f7a50d 100644 --- a/integration/terraform/ec2/linux/vpc.tf +++ b/integration/terraform/ec2/linux/vpc.tf @@ -10,7 +10,8 @@ data "aws_subnets" "default" { } resource "aws_security_group" "ec2_security_group" { - name = "cwagent-sg-${random_id.testing_id.hex}" + name = "cwagent-sg-${random_id.testing_id.hex}" + vpc_id = data.aws_vpc.default.id egress { from_port = 0 diff --git a/integration/terraform/ec2/localstack/providers.tf b/integration/terraform/ec2/localstack/providers.tf index 19769a7fb3..5ff54f0d65 100644 --- a/integration/terraform/ec2/localstack/providers.tf +++ b/integration/terraform/ec2/localstack/providers.tf @@ -1,3 +1,3 @@ provider "aws" { - region = var.region + region = var.region } \ No newline at end of file diff --git a/integration/terraform/ec2/localstack/variables.tf b/integration/terraform/ec2/localstack/variables.tf index 273e736193..c7682a47cd 100644 --- a/integration/terraform/ec2/localstack/variables.tf +++ b/integration/terraform/ec2/localstack/variables.tf @@ -1,35 +1,35 @@ variable "ec2_instance_type" { - type = string + type = string default = "t3a.xlarge" } variable "ssh_key_name" { - type = string + type = string default = "cwagent-integ-test-key" } variable "region" { - type = string + type = string default = "us-west-2" } variable "ssh_key_value" { - type = string + type = string default = "" } variable "github_sha" { - type = string + type = string default = "" } variable "github_repo" { - type = string + type = string default = "" } variable "s3_bucket" { - type = string + type = string default = "" } diff --git a/integration/terraform/ec2/localstack/vpc.tf b/integration/terraform/ec2/localstack/vpc.tf index 6e1f7c58e5..3f41f7a50d 100644 --- a/integration/terraform/ec2/localstack/vpc.tf +++ b/integration/terraform/ec2/localstack/vpc.tf @@ -10,7 +10,8 @@ data "aws_subnets" "default" { } resource "aws_security_group" "ec2_security_group" { - name = "cwagent-sg-${random_id.testing_id.hex}" + name = "cwagent-sg-${random_id.testing_id.hex}" + vpc_id = data.aws_vpc.default.id egress { from_port = 0 diff --git a/integration/terraform/ec2/win/main.tf b/integration/terraform/ec2/win/main.tf index b6cb0dfdd2..71ef87f3a1 100644 --- a/integration/terraform/ec2/win/main.tf +++ b/integration/terraform/ec2/win/main.tf @@ -36,41 +36,65 @@ resource "aws_instance" "cwagent" { vpc_security_group_ids = [aws_security_group.ec2_security_group.id] associate_public_ip_address = true get_password_data = true + user_data = < +Write-Output "Install OpenSSH and Firewalls which allows port 22 for connection" +Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0 +Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0 + +Start-Service sshd +Set-Service -Name sshd -StartupType 'Automatic' + +[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072 +Set-ExecutionPolicy Bypass -Scope Process -Force; iex ((New-Object System.Net.WebClient).DownloadString('https://chocolatey.org/install.ps1')) + +choco install git --confirm +choco install go --confirm +msiexec /i https://awscli.amazonaws.com/AWSCLIV2.msi /norestart /qb- + +[Environment]::SetEnvironmentVariable("PATH", "C:\ProgramData\chocolatey\bin;C:\Program Files\Git\cmd;C:\Program Files\Amazon\AWSCLIV2\;C:\Program Files\Go\bin;C:\Windows\System32;C:\Windows\System32\WindowsPowerShell\v1.0\", [System.EnvironmentVariableTarget]::Machine) + +EOF + tags = { - Name = "cwagent-integ-test-ec2-${var.test_name}-${random_id.testing_id.hex}" + Name = "cwagent-integ-test-ec2-windows-${element(split("/", var.test_dir),3)}-${random_id.testing_id.hex}" } } resource "null_resource" "integration_test" { depends_on = [aws_instance.cwagent] + # Install software provisioner "remote-exec" { - # @TODO when @ZhenyuTan-amz adds windows tests add "make integration-test" - # @TODO add export for AWS region from tf vars to make sure runner can use AWS SDK inline = [ + "start /wait timeout 120", //Wait some time to ensure all binaries have been downloaded + "call %ProgramData%\\chocolatey\\bin\\RefreshEnv.cmd", //Reload the environment variables to pull the latest one instead of restarting cmd + "set AWS_REGION=${var.region}", + "aws s3 cp s3://${var.s3_bucket}/integration-test/packaging/${var.github_sha}/amazon-cloudwatch-agent.msi .", + "start /wait msiexec /i amazon-cloudwatch-agent.msi /norestart /qb-", "echo clone and install agent", "git clone ${var.github_repo}", "cd amazon-cloudwatch-agent", "git reset --hard ${var.github_sha}", - "aws s3 cp s3://${var.s3_bucket}/integration-test/packaging/${var.github_sha}/amazon-cloudwatch-agent.msi .", - "msiexec /i amazon-cloudwatch-agent.msi", "echo run tests with the tag integration, one at a time, and verbose", "echo run sanity test && go test ./integration/test/sanity -p 1 -v --tags=integration", + "go test ${var.test_dir} -p 1 -timeout 30m -v --tags=integration " ] connection { type = "ssh" user = "Administrator" - private_key = local.private_key_content password = rsadecrypt(aws_instance.cwagent.password_data, local.private_key_content) host = aws_instance.cwagent.public_ip target_platform = "windows" + timeout = "6m" } } } data "aws_ami" "latest" { most_recent = true - owners = ["self", "506463145083"] + // @Todo: Add back when nvidia_gpu pipeline has been able to produced the AMI + #owners = ["self", "506463145083"] filter { name = "name" diff --git a/integration/terraform/ec2/win/variables.tf b/integration/terraform/ec2/win/variables.tf index b9d6353fa4..7855edcad5 100644 --- a/integration/terraform/ec2/win/variables.tf +++ b/integration/terraform/ec2/win/variables.tf @@ -5,22 +5,22 @@ variable "region" { variable "ec2_instance_type" { type = string - default = "t3a.xlarge" + default = "g4dn.xlarge" } variable "ami" { type = string - default = "cloudwatch-agent-integration-test-win-2022*" + default = "Windows_Server-2019-English-Deep-Learning*" } variable "github_sha" { type = string - default = "aee2f5c9b1b0a7a840b441da37a63ede7506a343" + default = "4cefc9f0e9b411c6765c5122877c136dbb23588d" } variable "github_repo" { type = string - default = "https://github.com/aws/amazon-cloudwatch-agent" + default = "https://github.com/aws/amazon-cloudwatch-agent.git" } variable "ssh_key_name" { @@ -38,7 +38,7 @@ variable "s3_bucket" { default = "" } -variable "test_name" { +variable "test_dir" { type = string - default = "windows-2022" + default = "./integration/test/nvidia_gpu" } diff --git a/integration/terraform/ec2/win/vpc.tf b/integration/terraform/ec2/win/vpc.tf index c4d7725287..33ec7a74ef 100644 --- a/integration/terraform/ec2/win/vpc.tf +++ b/integration/terraform/ec2/win/vpc.tf @@ -10,7 +10,8 @@ data "aws_subnets" "default" { } resource "aws_security_group" "ec2_security_group" { - name = "cwagent-sg-${random_id.testing_id.hex}" + name = "cwagent-sg-${random_id.testing_id.hex}" + vpc_id = data.aws_vpc.default.id egress { from_port = 0 @@ -20,9 +21,9 @@ resource "aws_security_group" "ec2_security_group" { } ingress { - from_port = "22" - to_port = "22" - protocol = "tcp" + from_port = 22 + to_port = 22 + protocol = "TCP" cidr_blocks = ["0.0.0.0/0"] } } \ No newline at end of file diff --git a/integration/terraform/ecs/linux/README.md b/integration/terraform/ecs/linux/README.md index 531c83b440..9590ffb982 100644 --- a/integration/terraform/ecs/linux/README.md +++ b/integration/terraform/ecs/linux/README.md @@ -1,7 +1,9 @@ Running ECS Fargate Integration Tests ========================= +## Prerequisite +* [ECR Repository with the docker image](https://docs.aws.amazon.com/AmazonECR/latest/userguide/getting-started-console.html) -## 1. How ECS Fargate are set up? +## How ECS Fargate are set up? **Step 1:** Create a Fargate ECS Cluster with the default VPC Network. **Step 2:** Create a security group to assign to the service in step 5 which allows all inbound traffics and outbound traffics @@ -12,7 +14,7 @@ to decide which containers serve a specific task and assign the IAM roles in s **Step 5:** Create a [service](https://docs.aws.amazon.com/AmazonECS/latest/developerguide/ecs_services.html) which configure how many tasks are running in parallel and ensure availability of the task. -## 2. Setup resources +## Setup resources By running `terraform apply -auto-approve -lock=false`, you agree to setup the following resources: * 1 IAM Task Role and 1 Execution Task Role (similar to [these IAM Roles](https://docs.aws.amazon.com/AmazonCloudWatch/latest/monitoring/deploy_servicelens_CloudWatch_agent_deploy_ECS.html)) @@ -52,7 +54,7 @@ To be more specifically, * **CloudWatchAgent Parameter Store:** Store CloudWatchAgent's configuration and CloudWatchAgent will pull the config from there. [Example configuration](default_resources/default_amazon_cloudwatch_agent.json) * **Prometheus Parameter Store:** Store Prometheus's configuration and CloudWatchAgent will pull the config from there. [Example configuration](default_resources/default_ecs_prometheus.tpl) -## 3. Run tests in your AWS account +## Run tests in your AWS account ```` cd integration/terraform/ecs && terraform init && terraform apply -auto-approve \ -var="test_dir={{your test case folder name}} \ diff --git a/integration/terraform/ecs/linux/vpc.tf b/integration/terraform/ecs/linux/vpc.tf index c68b9aba3a..40fb0c3ff1 100644 --- a/integration/terraform/ecs/linux/vpc.tf +++ b/integration/terraform/ecs/linux/vpc.tf @@ -10,7 +10,8 @@ data "aws_subnets" "default" { } resource "aws_security_group" "ecs_security_group" { - name = "cwagent-sg-${random_id.testing_id.hex}" + name = "cwagent-sg-${random_id.testing_id.hex}" + vpc_id = data.aws_vpc.default.id egress { from_port = 0 diff --git a/integration/test/agent_util.go b/integration/test/agent_util_linux.go similarity index 64% rename from integration/test/agent_util.go rename to integration/test/agent_util_linux.go index 3ade7a29eb..51bfe6ec69 100644 --- a/integration/test/agent_util.go +++ b/integration/test/agent_util_linux.go @@ -1,22 +1,17 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: MIT -//go:build integration -// +build integration +//go:build linux && integration +// +build linux,integration package test import ( - "context" "fmt" "log" "os/exec" "path/filepath" "time" - - "github.com/aws/aws-sdk-go-v2/config" - "github.com/aws/aws-sdk-go-v2/feature/ec2/imds" - "github.com/aws/aws-sdk-go-v2/service/cloudwatch" ) func CopyFile(pathIn string, pathOut string) { @@ -92,31 +87,13 @@ func RunShellScript(path string, args ...string) error{ out, err = exec.Command("bash", bashArgs...).Output() if err != nil { - log.Fatalf("Error occurred when executing %s: %s | %s", path, err.Error(), string(out)) + log.Printf("Error occurred when executing %s: %s | %s", path, err.Error(), string(out)) return err } return nil } -func RunPowerShellScript(path string, args ...string) error{ - ps, err := exec.LookPath("powershell.exe") - - if err != nil { - return err - } - - bashArgs := append([]string{"-NoProfile", "-NonInteractive", "-NoExit", path}, args...) - out, err := exec.Command(ps, bashArgs...).Output() - - if err != nil { - log.Fatalf("Error occurred when executing %s: %s | %s", path, err.Error(), string(out)) - return err - } - - return nil -} - func RunCommand(cmd string) { out, err := exec.Command("bash", "-c", cmd).Output() @@ -132,28 +109,3 @@ func ReplaceLocalStackHostName(pathIn string) { log.Fatal(fmt.Sprint(err) + string(out)) } } - -func GetInstanceId() string { - ctx := context.Background() - c, err := config.LoadDefaultConfig(ctx) - if err != nil { - // fail fast so we don't continue the test - log.Fatalf("Error occurred while creating SDK config: %v", err) - } - - // TODO: this only works for EC2 based testing - client := imds.NewFromConfig(c) - metadata, err := client.GetInstanceIdentityDocument(ctx, &imds.GetInstanceIdentityDocumentInput{}) - if err != nil { - log.Fatalf("Error occurred while retrieving EC2 instance ID: %v", err) - } - return metadata.InstanceID -} - -func GetCWClient(cxt context.Context) *cloudwatch.Client { - defaultConfig, err := config.LoadDefaultConfig(cxt) - if err != nil { - log.Fatalf("err occurred while creating config %v", err) - } - return cloudwatch.NewFromConfig(defaultConfig) -} diff --git a/integration/test/agent_util_windows.go b/integration/test/agent_util_windows.go new file mode 100644 index 0000000000..4cc5053559 --- /dev/null +++ b/integration/test/agent_util_windows.go @@ -0,0 +1,101 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +//go:build windows && integration +// +build windows,integration + +package test + +import ( + "os/exec" + "log" + "fmt" + "path/filepath" +) + +func CopyFile(pathIn string, pathOut string) error{ + ps, err := exec.LookPath("powershell.exe") + + if err != nil { + return err + } + + log.Printf("Copy File %s to %s", pathIn, pathOut) + pathInAbs, err := filepath.Abs(pathIn) + + if err != nil { + return err + } + + log.Printf("File %s abs path %s", pathIn, pathInAbs) + bashArgs := append([]string{"-NoProfile", "-NonInteractive", "-NoExit", "cp "+pathInAbs+" "+pathOut}) + out, err := exec.Command(ps, bashArgs...).Output() + + if err != nil { + log.Printf("Copy file failed: %v; the output is: %s",err, string(out)) + return err + } + + log.Printf("File : %s copied to : %s", pathIn, pathOut) + return nil + +} + +func StartAgent(configOutputPath string, fatalOnFailure bool) error { + ps, err := exec.LookPath("powershell.exe") + + if err != nil { + return err + } + + bashArgs := append([]string{"-NoProfile", "-NonInteractive", "-NoExit", "& \"C:\\Program Files\\Amazon\\AmazonCloudWatchAgent\\amazon-cloudwatch-agent-ctl.ps1\" -a fetch-config -m ec2 -s -c file:"+configOutputPath}) + out, err := exec.Command(ps, bashArgs...).Output() + + if err != nil && fatalOnFailure { + log.Printf("Start agent failed: %v; the output is: %s",err, string(out)) + return err + } else if err != nil { + log.Printf(fmt.Sprint(err) + string(out)) + } else { + log.Printf("Agent has started") + } + + return err +} + +func StopAgent() error{ + ps, err := exec.LookPath("powershell.exe") + + if err != nil { + return err + } + + bashArgs := append([]string{"-NoProfile", "-NonInteractive", "-NoExit", "& \"C:\\Program Files\\Amazon\\AmazonCloudWatchAgent\\amazon-cloudwatch-agent-ctl.ps1\" -a stop"}) + out, err := exec.Command(ps, bashArgs...).Output() + + if err != nil { + log.Printf("Stop agent failed: %v; the output is: %s",err, string(out)) + return err + } + + log.Printf("Agent is stopped") + return nil +} + +func RunShellScript(path string, args ...string) error{ + ps, err := exec.LookPath("powershell.exe") + + if err != nil { + return err + } + + bashArgs := append([]string{"-NoProfile", "-NonInteractive", "-NoExit", path}, args...) + out, err := exec.Command(ps, bashArgs...).Output() + + if err != nil { + log.Printf("Error occurred when executing %s: %s | %s", path, err.Error(), string(out)) + return err + } + + return nil +} \ No newline at end of file diff --git a/integration/test/cwm_util.go b/integration/test/cwm_util.go new file mode 100644 index 0000000000..4d977a9a2a --- /dev/null +++ b/integration/test/cwm_util.go @@ -0,0 +1,100 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +//go:build integration +// +build integration + +package test + +import ( + "context" + "fmt" + "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/service/cloudwatch" + "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" + "github.com/aws/aws-sdk-go/aws" + "testing" +) + +var ( + metricsCtx context.Context + cwm *cloudwatch.Client +) + +const ( + instanceId = "InstanceId" + appendMetric = "append" + loremIpsum = "Lorem ipsum dolor sit amet consectetur adipiscing elit Vivamus non mauris malesuada mattis ex eget porttitor purus Suspendisse potenti Praesent vel sollicitudin ipsum Quisque luctus pretium lorem non faucibus Ut vel quam dui Nunc fermentum condimentum consectetur Morbi tellus mauris tristique tincidunt elit consectetur hendrerit placerat dui In nulla erat finibus eget erat a hendrerit sodales urna In sapien purus auctor sit amet congue ut congue eget nisi Vivamus sed neque ut ligula lobortis accumsan quis id metus In feugiat velit et leo mattis non fringilla dui elementum Proin a nisi ac sapien vulputate consequat Vestibulum eu tellus mi Integer consectetur efficitur" +) + +type metric struct { + name string + value string +} + +// ValidateMetrics takes the metric name, metric dimension and corresponding namespace that contains the metric +func ValidateMetrics(t *testing.T, metricName, namespace string, dimensionsFilter []types.DimensionFilter) { + cwmClient, clientContext, err := GetCloudWatchMetricsClient() + if err != nil { + t.Fatalf("Error occurred while creating CloudWatch Logs SDK client: %v", err.Error()) + } + + listMetricsInput := cloudwatch.ListMetricsInput{ + MetricName: aws.String(metricName), + Namespace: aws.String(namespace), + RecentlyActive: "PT3H", + Dimensions: dimensionsFilter, + } + data, err := cwmClient.ListMetrics(*clientContext, &listMetricsInput) + if err != nil { + t.Errorf("Error getting metric data %v", err) + } + + // Only validate if certain metrics are published by CloudWatchAgent in corresponding namespace + // Since the metric value can be unpredictive. + if len(data.Metrics) == 0 { + metrics := make([]metric, len(dimensionsFilter)) + for i, filter := range dimensionsFilter { + metrics[i] = metric{ + name: *filter.Name, + value: *filter.Value, + } + } + t.Errorf("No metrics found for dimension %v metric name %v namespace %v", + metrics, metricName, namespace) + } + +} + +// getCloudWatchMetricsClient returns a singleton SDK client for interfacing with CloudWatch Metrics +func GetCloudWatchMetricsClient() (*cloudwatch.Client, *context.Context, error) { + if cwm == nil { + metricsCtx = context.Background() + c, err := config.LoadDefaultConfig(metricsCtx) + if err != nil { + return nil, nil, err + } + + cwm = cloudwatch.NewFromConfig(c) + } + return cwm, &metricsCtx, nil +} + +func BuildDimensionFilterList(appendDimension int) []types.DimensionFilter { + // we append dimension from 0 to max number - 2 + // then we add dimension instance id + // thus for max dimension 10, 0 to 8 + instance id = 10 dimension + ec2InstanceId := GetInstanceId() + dimensionFilter := make([]types.DimensionFilter, appendDimension) + for i := 0; i < appendDimension-1; i++ { + dimensionFilter[i] = types.DimensionFilter{ + Name: aws.String(fmt.Sprintf("%s%d", appendMetric, i)), + Value: aws.String(fmt.Sprintf("%s%d", loremIpsum+appendMetric, i)), + } + } + dimensionFilter[appendDimension-1] = types.DimensionFilter{ + Name: aws.String(instanceId), + Value: aws.String(ec2InstanceId), + } + return dimensionFilter +} \ No newline at end of file diff --git a/integration/test/metrics_number_dimension/metrics_number_dimension_test.go b/integration/test/metrics_number_dimension/metrics_number_dimension_test.go index 78e020af8c..f34e0c3e37 100644 --- a/integration/test/metrics_number_dimension/metrics_number_dimension_test.go +++ b/integration/test/metrics_number_dimension/metrics_number_dimension_test.go @@ -7,7 +7,6 @@ package metrics_number_dimension import ( - "context" "fmt" "log" "testing" @@ -16,7 +15,6 @@ import ( "github.com/aws/amazon-cloudwatch-agent/integration/test" cwPlugin "github.com/aws/amazon-cloudwatch-agent/plugins/outputs/cloudwatch" "github.com/aws/aws-sdk-go-v2/aws" - "github.com/aws/aws-sdk-go-v2/service/cloudwatch" "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" ) @@ -89,29 +87,8 @@ func TestNumberMetricDimension(t *testing.T) { test.StopAgent() // test for cloud watch metrics - cxt := context.Background() dimensionFilter := buildDimensionFilterList(parameter.numberDimensionsInCW) - client := test.GetCWClient(cxt) - listMetricsInput := cloudwatch.ListMetricsInput{ - MetricName: aws.String(parameter.metricName), - Namespace: aws.String(namespace), - Dimensions: dimensionFilter, - } - data, err := client.ListMetrics(cxt, &listMetricsInput) - if err != nil { - t.Errorf("Error getting metric data %v", err) - } - if len(data.Metrics) == 0 { - metrics := make([]metric, len(dimensionFilter)) - for i, filter := range dimensionFilter { - metrics[i] = metric{ - name: *filter.Name, - value: *filter.Value, - } - } - t.Errorf("No metrics found for dimension %v metric name %v namespace %v", - metrics, parameter.metricName, namespace) - } + test.ValidateMetrics(t, parameter.metricName, namespace, dimensionFilter) }) } } diff --git a/integration/test/nvidia_gpu/metrics_nvidia_gpu_linux_test.go b/integration/test/nvidia_gpu/metrics_nvidia_gpu_linux_test.go new file mode 100644 index 0000000000..1efb9f76d0 --- /dev/null +++ b/integration/test/nvidia_gpu/metrics_nvidia_gpu_linux_test.go @@ -0,0 +1,53 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +//go:build linux && integration +// +build linux,integration + +package metrics_nvidia_gpu + +import ( + "github.com/aws/amazon-cloudwatch-agent/integration/test" + "github.com/aws/amazon-cloudwatch-agent/internal/util/security" + "testing" + "time" +) + +const ( + configLinuxJSON = "resources/config_linux.json" + metricLinuxNamespace = "NvidiaGPULinuxTest" + configLinuxOutputPath = "/opt/aws/amazon-cloudwatch-agent/bin/config.json" + agentLinuxLogPath = "/opt/aws/amazon-cloudwatch-agent/logs/amazon-cloudwatch-agent.log" + agentLinuxRuntime = 2 * time.Minute + agentLinuxPermission = "root" + numberofLinuxAppendDimensions = 1 +) + +var expectedNvidiaGPULinuxMetrics = []string{"mem_used_percent", "nvidia_smi_utilization_gpu", "nvidia_smi_utilization_memory", "nvidia_smi_power_draw", "nvidia_smi_temperature_gpu"} + +func TestNvidiaGPU(t *testing.T) { + t.Run("Basic configuration testing for both metrics and logs", func(t *testing.T) { + test.CopyFile(configLinuxJSON, configLinuxOutputPath) + test.StartAgent(configLinuxOutputPath, true) + + time.Sleep(agentLinuxRuntime) + t.Logf("Agent has been running for : %s", agentLinuxRuntime.String()) + test.StopAgent() + + dimensionFilter := test.BuildDimensionFilterList(numberofLinuxAppendDimensions) + for _, metricName := range expectedNvidiaGPULinuxMetrics { + test.ValidateMetrics(t, metricName, metricLinuxNamespace, dimensionFilter) + } + + if err := security.CheckFileRights(agentLinuxLogPath); err != nil { + t.Fatalf("CloudWatchAgent does not have privellege to write and read CWA's log: %v", err) + } + + if err := security.CheckFileOwnerRights(agentLinuxLogPath,agentLinuxPermission); err != nil { + t.Fatalf("CloudWatchAgent does not have right to CWA's log: %v", err) + } + + }) +} + + diff --git a/integration/test/nvidia_gpu/metrics_nvidia_gpu_window_test.go b/integration/test/nvidia_gpu/metrics_nvidia_gpu_window_test.go new file mode 100644 index 0000000000..6c272f2001 --- /dev/null +++ b/integration/test/nvidia_gpu/metrics_nvidia_gpu_window_test.go @@ -0,0 +1,60 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +//go:build windows && integration +// +build windows,integration + +package metrics_nvidia_gpu + +import ( + "github.com/aws/amazon-cloudwatch-agent/integration/test" + "github.com/aws/amazon-cloudwatch-agent/internal/util/security" + "testing" + "time" +) + +const ( + configWindowsJSON = "resources/config_windows.json" + metricWindowsnamespace = "NvidiaGPUWindowsTest" + configWindowsOutputPath = "C:\\ProgramData\\Amazon\\AmazonCloudWatchAgent\\config.json" + agentWindowsLogPath = "C:\\ProgramData\\Amazon\\AmazonCloudWatchAgent\\Logs\\amazon-cloudwatch-agent.log" + agentWindowsRuntime = 3 * time.Minute + numberofWindowsAppendDimensions = 1 +) + +var expectedNvidiaGPUWindowsMetrics = []string{"Memory % Committed Bytes In Use", "nvidia_smi utilization_gpu", "nvidia_smi utilization_memory", "nvidia_smi power_draw", "nvidia_smi temperature_gpu"} + +func TestNvidiaGPUWindows(t *testing.T) { + t.Run("Run CloudWatchAgent with Nvidia-smi on Windows", func(t *testing.T) { + err := test.CopyFile(configWindowsJSON, configWindowsOutputPath) + + if err != nil { + t.Fatalf(err.Error()) + } + + err = test.StartAgent(configWindowsOutputPath, true) + + if err != nil { + t.Fatalf(err.Error()) + } + + time.Sleep(agentWindowsRuntime) + t.Logf("Agent has been running for : %s", agentWindowsRuntime.String()) + err = test.StopAgent() + + if err != nil { + t.Fatalf(err.Error()) + } + + dimensionFilter := test.BuildDimensionFilterList(numberofWindowsAppendDimensions) + for _, metricName := range expectedNvidiaGPUWindowsMetrics { + test.ValidateMetrics(t, metricName, metricWindowsnamespace, dimensionFilter) + } + + err = security.CheckFileRights(agentWindowsLogPath) + if err != nil { + t.Fatalf("CloudWatchAgent's log does not have protection from local system and admin: %v", err) + } + + }) +} diff --git a/integration/test/nvidia_gpu/resources/config_linux.json b/integration/test/nvidia_gpu/resources/config_linux.json new file mode 100644 index 0000000000..05432709df --- /dev/null +++ b/integration/test/nvidia_gpu/resources/config_linux.json @@ -0,0 +1,30 @@ +{ + "agent": { + "metrics_collection_interval": 60, + "run_as_user": "root", + "debug": true + }, + "metrics": { + "namespace": "NvidiaGPULinuxTest", + "append_dimensions": { + "InstanceId": "${aws:InstanceId}" + }, + "metrics_collected": { + "nvidia_gpu": { + "measurement": [ + "utilization_gpu", + "utilization_memory", + "power_draw", + "temperature_gpu" + ], + "metrics_collection_interval": 1 + }, + "mem": { + "measurement": [ + "mem_used_percent" + ], + "metrics_collection_interval": 1 + } + } + } +} \ No newline at end of file diff --git a/integration/test/nvidia_gpu/resources/config_windows.json b/integration/test/nvidia_gpu/resources/config_windows.json new file mode 100644 index 0000000000..3042e72f44 --- /dev/null +++ b/integration/test/nvidia_gpu/resources/config_windows.json @@ -0,0 +1,25 @@ +{ + "metrics": { + "namespace": "NvidiaGPUWindowsTest", + "append_dimensions": { + "InstanceId": "${aws:InstanceId}" + }, + "metrics_collected": { + "Memory": { + "measurement": [ + "% Committed Bytes In Use" + ], + "metrics_collection_interval": 1 + }, + "nvidia_gpu": { + "measurement": [ + "utilization_gpu", + "utilization_memory", + "power_draw", + "temperature_gpu" + ], + "metrics_collection_interval": 1 + } + } + } +} \ No newline at end of file diff --git a/integration/test/performancetest/transmitter.go b/integration/test/performancetest/transmitter.go index 83b2200c6b..aeb3745a96 100644 --- a/integration/test/performancetest/transmitter.go +++ b/integration/test/performancetest/transmitter.go @@ -1,3 +1,4 @@ + package performancetest import ( diff --git a/integration/test/sanity/sanity_windows.go b/integration/test/sanity/sanity_windows.go index e7b61380b3..dece43ea10 100644 --- a/integration/test/sanity/sanity_windows.go +++ b/integration/test/sanity/sanity_windows.go @@ -12,7 +12,7 @@ import ( ) func SanityCheck(t *testing.T) { - err := test.RunPowerShellScript("resources/verifyWindowsCtlScript.ps1") + err := test.RunShellScript("resources/verifyWindowsCtlScript.ps1") if err != nil { t.Fatalf("Running sanity check failed") } diff --git a/integration/test/util.go b/integration/test/util.go new file mode 100644 index 0000000000..b363870efb --- /dev/null +++ b/integration/test/util.go @@ -0,0 +1,31 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +//go:build integration +// +build integration + +package test + +import ( + "context" + "log" + "github.com/aws/aws-sdk-go-v2/config" + "github.com/aws/aws-sdk-go-v2/feature/ec2/imds" +) + +func GetInstanceId() string { + ctx := context.Background() + c, err := config.LoadDefaultConfig(ctx) + if err != nil { + // fail fast so we don't continue the test + log.Fatalf("Error occurred while creating SDK config: %v", err) + } + + // TODO: this only works for EC2 based testing + client := imds.NewFromConfig(c) + metadata, err := client.GetInstanceIdentityDocument(ctx, &imds.GetInstanceIdentityDocumentInput{}) + if err != nil { + log.Fatalf("Error occurred while retrieving EC2 instance ID: %v", err) + } + return metadata.InstanceID +} diff --git a/internal/util/security/unix_permission.go b/internal/util/security/unix_permission.go new file mode 100644 index 0000000000..1ac689af0a --- /dev/null +++ b/internal/util/security/unix_permission.go @@ -0,0 +1,51 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +// go:build !windows +// +build !windows + +package security + +import ( + "fmt" + "syscall" + "os/user" +) + +// CheckFileRights check that the given file path has been protected by the owner. +// If the owner is changed, they need at least the sudo permission to override the owner. +func CheckFileRights(filePath string) error { + var stat syscall.Stat_t + if err := syscall.Stat(filePath, &stat); err != nil { + return fmt.Errorf("Cannot get file's stat %s: %v", filePath, err) + } + + // Check the owner of binary has read, write, exec. + if !(stat.Mode&(syscall.S_IXUSR) == 0 || stat.Mode&(syscall.S_IRUSR) == 0 || stat.Mode&(syscall.S_IWUSR) == 0) { + return nil + } + + // Check the owner of file has read, write + if !(stat.Mode&(syscall.S_IRUSR) == 0 || stat.Mode&(syscall.S_IWUSR) == 0) { + return nil + } + + return fmt.Errorf("File's owner does not have enough permission at path %s", filePath) +} + + +// CheckFileOwnerRights check that the given owner is the same owner of the given filepath +func CheckFileOwnerRights(filePath, requiredOwner string) error { + var stat syscall.Stat_t + if err := syscall.Stat(filePath, &stat); err != nil { + return fmt.Errorf("Cannot get file's stat %s: %v", filePath, err) + } + + if owner, err := user.LookupId(fmt.Sprintf("%d", stat.Uid)); err != nil { + return fmt.Errorf("Cannot look up file owner's name %s: %v", filePath, err) + } else if owner.Name != requiredOwner { + return fmt.Errorf("Agent does not have permission to protect file %s", filePath) + } + + return nil +} \ No newline at end of file diff --git a/internal/util/security/windows_permission.go b/internal/util/security/windows_permission.go new file mode 100644 index 0000000000..6eda9eaa56 --- /dev/null +++ b/internal/util/security/windows_permission.go @@ -0,0 +1,109 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +// go:build windows +// +build windows + +package security + +import ( + "fmt" + "golang.org/x/sys/windows" + "os" + "unsafe" +) + +// CheckFileRights check that the given filename has access controls and system permission for Administrator, Local System +func CheckFileRights(filePath string) error { + if _, err := os.Stat(filePath); err != nil { + return fmt.Errorf("Cannot get file's stat %s: %v", filePath, err) + } + + var fileDacl *Acl + err := GetNamedSecurityInfo(filePath, + SE_FILE_OBJECT, + DACL_SECURITY_INFORMATION, + nil, + nil, + &fileDacl, + nil, + nil) + + if err != nil { + return fmt.Errorf("Cannot get file security info %s: %s", filePath, err) + } + + var aclSizeInfo AclSizeInformation + err = GetAclInformation(fileDacl, &aclSizeInfo, AclSizeInformationEnum) + if err != nil { + return fmt.Errorf("Cannot query file's ACLs %s: %s", filePath, err) + } + + // create the sids that are acceptable to us (local system account and administrators group) + // For more information on account type: https://stackoverflow.com/a/510225 + var localSystem *windows.SID + + err = windows.AllocateAndInitializeSid(&windows.SECURITY_NT_AUTHORITY, + 1, // local system has 1 valid subauth + windows.SECURITY_LOCAL_SYSTEM_RID, + 0, 0, 0, 0, 0, 0, 0, + &localSystem) + + if err != nil { + return fmt.Errorf("Cannot initialize Local System SID: %v", err) + } + + defer windows.FreeSid(localSystem) + + var administrators *windows.SID + + err = windows.AllocateAndInitializeSid(&windows.SECURITY_NT_AUTHORITY, + 2, // administrators group has 2 valid subauths + windows.SECURITY_BUILTIN_DOMAIN_RID, + windows.DOMAIN_ALIAS_RID_ADMINS, + 0, 0, 0, 0, 0, 0, + &administrators) + + if err != nil { + return fmt.Errorf("Cannot initialize Administrator SID: %s", err) + } + + defer windows.FreeSid(administrators) + + hasFileAllAccessLocalSystem := false + hasFileAllAccessAdministrators := false + + for i := uint32(0); i < aclSizeInfo.AceCount; i++ { + var pAce *AccessAllowedAce + if err := GetAce(fileDacl, i, &pAce); err != nil { + return fmt.Errorf("Could not query a ACE on %s with: %s", filePath, err) + } + + compareSid := (*windows.SID)(unsafe.Pointer(&pAce.SidStart)) + compareIsLocalSystem := windows.EqualSid(compareSid, localSystem) + compareIsAdministrators := windows.EqualSid(compareSid, administrators) + + if pAce.AceType == ACCESS_DENIED_ACE_TYPE { + // if the file has denied access to local system or administrators, then it cannot be protected by those accounts + if compareIsLocalSystem || compareIsAdministrators { + return fmt.Errorf("File %s has deny access for Administrators and Local System", filePath) + } + } + + if pAce.AccessMask == FILE_ALL_ACCESS { + if compareIsLocalSystem { + hasFileAllAccessLocalSystem = true + } + if compareIsAdministrators { + hasFileAllAccessAdministrators = true + } + } + + } + + if !hasFileAllAccessLocalSystem || !hasFileAllAccessAdministrators { + return fmt.Errorf("No highest file access for Administrators and Local System with %s", filePath) + } + + return nil +} diff --git a/internal/util/security/windows_sec.go b/internal/util/security/windows_sec.go new file mode 100644 index 0000000000..4d97a20094 --- /dev/null +++ b/internal/util/security/windows_sec.go @@ -0,0 +1,109 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +// go:build windows +// +build windows + +package security + +import ( + "syscall" + "unsafe" + + "golang.org/x/sys/windows" +) + +// https://docs.microsoft.com/en-us/windows/win32/api/accctrl/ne-accctrl-se_object_type +const ( + SE_UNKNOWN_OBJECT_TYPE = iota + SE_FILE_OBJECT +) + +// https://github.com/mhammond/pywin32/blob/70ddf693927fa1635f15e9ef41eb1aea37fdf32a/win32/Lib/ntsecuritycon.py +const ( + ACCESS_ALLOWED_ACE_TYPE = 0 + ACCESS_DENIED_ACE_TYPE = 1 + FILE_ALL_ACCESS = (windows.STANDARD_RIGHTS_ALL | 0x1FF) +) + +const ( + AclSizeInformationEnum = 2 + DACL_SECURITY_INFORMATION = 0x00004 +) + +var ( + advapi32 = syscall.NewLazyDLL("advapi32.dll") + procGetAclInformation = advapi32.NewProc("GetAclInformation") + procGetNamedSecurityInfo = advapi32.NewProc("GetNamedSecurityInfoW") + procGetAce = advapi32.NewProc("GetAce") +) + +// https://docs.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-acl_size_information +type AclSizeInformation struct { + AceCount uint32 + AclBytesInUse uint32 + AclBytesFree uint32 +} + +// https://docs.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-acl +type Acl struct { + AclRevision uint8 + Sbz1 uint8 + AclSize uint16 + AceCount uint16 + Sbz2 uint16 +} + +// https://docs.microsoft.com/en-us/windows-hardware/drivers/ddi/ntifs/ns-ntifs-_access_allowed_ace +type AccessAllowedAce struct { + AceType uint8 + AceFlags uint8 + AceSize uint16 + AccessMask uint32 + SidStart uint32 +} + +// Retrieve a copy of security descriptor for an object specified by name (e.g a file) +// For more information: https://docs.microsoft.com/en-us/windows/win32/api/aclapi/nf-aclapi-getnamedsecurityinfoa +func GetNamedSecurityInfo(objectName string, objectType int32, secInfo uint32, owner, group **windows.SID, dacl, sacl **Acl, secDesc *windows.Handle) error { + ret, _, err := procGetNamedSecurityInfo.Call( + uintptr(unsafe.Pointer(windows.StringToUTF16Ptr(objectName))), + uintptr(objectType), + uintptr(secInfo), + uintptr(unsafe.Pointer(owner)), + uintptr(unsafe.Pointer(group)), + uintptr(unsafe.Pointer(dacl)), + uintptr(unsafe.Pointer(sacl)), + uintptr(unsafe.Pointer(secDesc)), + ) + if ret != 0 { + return err + } + return nil +} + +// Retrieve information about access control list (e.g a file) +// For more information: https://docs.microsoft.com/en-us/windows/win32/api/securitybaseapi/nf-securitybaseapi-getaclinformation +func GetAclInformation(acl *Acl, info *AclSizeInformation, class uint32) error { + length := unsafe.Sizeof(*info) + ret, _, _ := procGetAclInformation.Call( + uintptr(unsafe.Pointer(acl)), + uintptr(unsafe.Pointer(info)), + uintptr(length), + uintptr(class)) + + if int(ret) == 0 { + return windows.GetLastError() + } + return nil +} + +// Obtain a pointer to an access control entry (ACE) in an access control list (ACL). +// For more information: https://docs.microsoft.com/en-us/windows/win32/api/securitybaseapi/nf-securitybaseapi-getace +func GetAce(acl *Acl, index uint32, ace **AccessAllowedAce) error { + ret, _, _ := procGetAce.Call(uintptr(unsafe.Pointer(acl)), uintptr(index), uintptr(unsafe.Pointer(ace))) + if int(ret) != 0 { + return windows.GetLastError() + } + return nil +} diff --git a/translator/translate/metrics/util/commonconfigutil.go b/translator/translate/metrics/util/commonconfigutil.go index 5d9f1b3e8a..d52dc9e751 100755 --- a/translator/translate/metrics/util/commonconfigutil.go +++ b/translator/translate/metrics/util/commonconfigutil.go @@ -198,7 +198,7 @@ func ProcessMetricsAggregationInterval(input interface{}, defaultValue, pluginNa return } -//check if desiredVal exist in inputs list +// check if desiredVal exist in inputs list func ListContains(inputs []string, desiredVal string) bool { for _, val := range inputs { if val == desiredVal { diff --git a/translator/translate/metrics/util/measurementutil.go b/translator/translate/metrics/util/measurementutil.go index bd8781bb90..6337f2838d 100755 --- a/translator/translate/metrics/util/measurementutil.go +++ b/translator/translate/metrics/util/measurementutil.go @@ -14,16 +14,22 @@ import ( "github.com/aws/amazon-cloudwatch-agent/translator/translate/metrics/config" ) -const field_pass_key = "fieldpass" -const windows_measurement_key = "Counters" -const measurement_name = "name" -const measurement_category = "category" -const measurement_rename = "rename" -const measurement_unit = "unit" -const nvidia_smi_plugin_name = "nvidia_smi" -const tag_exclude_key = "tagexclude" -const smi_bin_path = "bin_path" -const default_windows_smi_path = "C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe" +const ( + tag_exclude_key = "tagexclude" + field_pass_key = "fieldpass" + windows_measurement_key = "Counters" + measurement_name = "name" + measurement_category = "category" + measurement_rename = "rename" + measurement_unit = "unit" +) + +const ( + smi_bin_path = "bin_path" + nvidia_smi_plugin_name = "nvidia_smi" + Default_Unix_Smi_Path = "/usr/bin/nvidia-smi" + Default_Windows_Smi_Path = "C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe" +) func ApplyMeasurementRule(inputs interface{}, pluginName string, targetOs string, path string) (returnKey string, returnVal []string) { inputList := inputs.([]interface{}) @@ -148,13 +154,15 @@ func isDecorationAvail(observationMap map[string]interface{}) bool { return false } -// "measurement": [ -// {"name": "cpu_usage_idle", "rename": "CPU_USAGE_IDLE", "unit": "unit"}, -// {"name": "cpu_usage_nice", "unit": "unit"}, -// "cpu_usage_guest", -// "time_active", -// "usage_active" -// ] +// "measurement": [ +// +// {"name": "cpu_usage_idle", "rename": "CPU_USAGE_IDLE", "unit": "unit"}, +// {"name": "cpu_usage_nice", "unit": "unit"}, +// "cpu_usage_guest", +// "time_active", +// "usage_active" +// +// ] func GetMeasurementName(input interface{}) (measurementNames []string) { m := input.(map[string]interface{}) if metricList, ok := m["measurement"]; ok { @@ -174,7 +182,7 @@ func GetMeasurementName(input interface{}) (measurementNames []string) { } // ApplyPluginSpecificRules returns a map contains all the rules for tagpass, tagdrop, namepass, namedrop, -//fieldpass, fielddrop, taginclude, tagexclude specifically for certain plugin. +// fieldpass, fielddrop, taginclude, tagexclude specifically for certain plugin. func ApplyPluginSpecificRules(pluginName string) (map[string]interface{}, bool) { switch pluginName { case nvidia_smi_plugin_name: @@ -183,7 +191,7 @@ func ApplyPluginSpecificRules(pluginName string) (map[string]interface{}, bool) if translator.GetTargetPlatform() == translatorConfig.OS_TYPE_WINDOWS { // default path for Nvidia_smi.exe is C:\\Program Files\\NVIDIA Corporation\\NVSMI\\nvidia-smi.exe // Todo: for windows 10 the path should default to C:\\Windows\\System32\\nvidia-smi.exe will support in the future - result[smi_bin_path] = default_windows_smi_path + result[smi_bin_path] = Default_Windows_Smi_Path } return result, true default: