diff --git a/.github/workflows/nfr.yml b/.github/workflows/nfr.yml index 9826b212c..8ce37fc73 100644 --- a/.github/workflows/nfr.yml +++ b/.github/workflows/nfr.yml @@ -144,9 +144,9 @@ jobs: working-directory: ./tests run: | if ${{ inputs.test_label != 'all' }}; then - sed -i '/^GINKGO_LABEL=/s/=.*/="${{ inputs.test_label }}"/' "scripts/vars.env" && make run-tests-on-vm; + sed -i '/^GINKGO_LABEL=/s/=.*/="${{ inputs.test_label }}"/' "scripts/vars.env" && make nfr-test; else - make run-tests-on-vm; + make nfr-test; fi - name: Cleanup diff --git a/.gitignore b/.gitignore index 81ad399f7..a87ca2ab3 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,6 @@ internal/mode/static/nginx/modules/coverage # Credential files **/gha-creds-*.json + +# SSH config files +*.ssh diff --git a/.yamllint.yaml b/.yamllint.yaml index 20470b80d..478262b7d 100644 --- a/.yamllint.yaml +++ b/.yamllint.yaml @@ -41,7 +41,7 @@ rules: .github/ deploy/manifests/nginx-gateway.yaml deploy/manifests/crds - tests/longevity/manifests/cronjob.yaml + tests/suite/manifests/longevity/cronjob.yaml .goreleaser.yml new-line-at-end-of-file: enable new-lines: enable diff --git a/tests/Makefile b/tests/Makefile index 6b32e47d0..463fb6f01 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -32,6 +32,10 @@ help: Makefile ## Display this help create-kind-cluster: ## Create a kind cluster cd .. && make create-kind-cluster +.PHONY: delete-kind-cluster +delete-kind-cluster: ## Delete kind cluster + kind delete cluster + .PHONY: build-images build-images: ## Build NGF and NGINX images cd .. && make PREFIX=$(PREFIX) TAG=$(TAG) build-images @@ -48,46 +52,70 @@ load-images: ## Load NGF and NGINX images on configured kind cluster load-images-with-plus: ## Load NGF and NGINX Plus images on configured kind cluster cd .. 
&& make PREFIX=$(PREFIX) TAG=$(TAG) load-images-with-plus -test: ## Run the system tests against your default k8s cluster - go test -v ./suite $(GINKGO_FLAGS) -args --gateway-api-version=$(GW_API_VERSION) \ - --gateway-api-prev-version=$(GW_API_PREV_VERSION) --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \ - --plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) \ - --pull-policy=$(PULL_POLICY) --k8s-version=$(K8S_VERSION) --service-type=$(GW_SERVICE_TYPE) \ - --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) +.PHONY: setup-gcp-and-run-tests +setup-gcp-and-run-tests: create-gke-router create-and-setup-vm run-tests-on-vm ## Create and setup a GKE router and GCP VM for tests and run the functional tests -.PHONY: delete-kind-cluster -delete-kind-cluster: ## Delete kind cluster - kind delete cluster +.PHONY: setup-gcp-and-run-nfr-tests +setup-gcp-and-run-nfr-tests: create-gke-router create-and-setup-vm nfr-test ## Create and setup a GKE router and GCP VM for tests and run the NFR tests -.PHONY: run-tests-on-vm -run-tests-on-vm: ## Run the tests on a GCP VM - bash scripts/run-tests-gcp-vm.sh +.PHONY: create-gke-cluster +create-gke-cluster: ## Create a GKE cluster + bash scripts/create-gke-cluster.sh $(CI) .PHONY: create-and-setup-vm create-and-setup-vm: ## Create and setup a GCP VM for tests bash scripts/create-and-setup-gcp-vm.sh -.PHONY: cleanup-vm -cleanup-vm: ## Delete the test GCP VM and delete the firewall rule - bash scripts/cleanup-vm.sh - .PHONY: create-gke-router create-gke-router: ## Create a GKE router to allow egress traffic from private nodes (allows for external image pulls) bash scripts/create-gke-router.sh -.PHONY: cleanup-router -cleanup-router: ## Delete the GKE router - bash scripts/cleanup-router.sh +.PHONY: sync-files-to-vm +sync-files-to-vm: ## Syncs your local NGF files with the NGF repo on the VM + bash scripts/sync-files-to-vm.sh -.PHONY: setup-gcp-and-run-tests -setup-gcp-and-run-tests: create-gke-router 
create-and-setup-vm run-tests-on-vm ## Create and setup a GKE router and GCP VM for tests and run the tests +.PHONY: run-tests-on-vm +run-tests-on-vm: ## Run the functional tests on a GCP VM + bash scripts/run-tests-gcp-vm.sh + +.PHONY: nfr-test +nfr-test: ## Run the NFR tests on a GCP VM + NFR=true bash scripts/run-tests-gcp-vm.sh + +.PHONY: start-longevity-test +start-longevity-test: ## Start the longevity test to run for 4 days in GKE + START_LONGEVITY=true $(MAKE) nfr-test + +.PHONY: stop-longevity-test +stop-longevity-test: ## Stops the longevity test and collects results + STOP_LONGEVITY=true $(MAKE) nfr-test + +.PHONY: .vm-nfr-test +.vm-nfr-test: ## Runs the NFR tests on the GCP VM (called by `nfr-test`) + go test -v ./suite -ginkgo.label-filter "nfr" $(GINKGO_FLAGS) -ginkgo.v -args --gateway-api-version=$(GW_API_VERSION) \ + --gateway-api-prev-version=$(GW_API_PREV_VERSION) --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \ + --plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) \ + --pull-policy=$(PULL_POLICY) --k8s-version=$(K8S_VERSION) --service-type=$(GW_SERVICE_TYPE) \ + --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) + +.PHONY: test +test: ## Runs the functional tests on your default k8s cluster + go test -v ./suite -ginkgo.label-filter "functional" $(GINKGO_FLAGS) -args --gateway-api-version=$(GW_API_VERSION) \ + --gateway-api-prev-version=$(GW_API_PREV_VERSION) --image-tag=$(TAG) --version-under-test=$(NGF_VERSION) \ + --plus-enabled=$(PLUS_ENABLED) --ngf-image-repo=$(PREFIX) --nginx-image-repo=$(NGINX_PREFIX) \ + --pull-policy=$(PULL_POLICY) --k8s-version=$(K8S_VERSION) --service-type=$(GW_SERVICE_TYPE) \ + --is-gke-internal-lb=$(GW_SVC_GKE_INTERNAL) .PHONY: cleanup-gcp cleanup-gcp: cleanup-router cleanup-vm delete-gke-cluster ## Cleanup all GCP resources -.PHONY: create-gke-cluster -create-gke-cluster: ## Create a GKE cluster - bash scripts/create-gke-cluster.sh $(CI) +.PHONY: cleanup-router 
+cleanup-router: ## Delete the GKE router + bash scripts/cleanup-router.sh + +.PHONY: cleanup-vm +cleanup-vm: ## Delete the test GCP VM and delete the firewall rule + bash scripts/cleanup-vm.sh .PHONY: delete-gke-cluster delete-gke-cluster: ## Delete the GKE cluster @@ -95,4 +123,4 @@ delete-gke-cluster: ## Delete the GKE cluster .PHONY: add-local-ip-to-cluster add-local-ip-to-cluster: ## Add local IP to the GKE cluster master-authorized-networks - bash scripts/add-local-ip-to-cluster.sh + bash scripts/add-local-ip-auth-networks.sh diff --git a/tests/README.md b/tests/README.md index 07a8ea141..50d0f5945 100644 --- a/tests/README.md +++ b/tests/README.md @@ -4,19 +4,22 @@ The tests in this directory are meant to be run on a live Kubernetes environment are similar to the existing [conformance tests](../conformance/README.md), but will verify things such as: - NGF-specific functionality -- Non-Functional requirements testing (such as performance, scale, etc.) +- Non-Functional requirements (NFR) testing (such as performance, scale, etc.) When running locally, the tests create a port-forward from your NGF Pod to localhost using a port chosen by the test framework. Traffic is sent over this port. If running on a GCP VM targeting a GKE cluster, the tests will create an internal LoadBalancer service which will receive the test traffic. +**Important**: NFR tests can only be run on a GKE cluster. + Directory structure is as follows: - `framework`: contains utility functions for running the tests -- `suite`: contains the test files - `results`: contains the results files +- `scripts`: contains scripts used to set up the environment and run the tests +- `suite`: contains the test files -**Note**: Existing NFR tests will be migrated into this testing `suite` and results stored in the `results` directory. +> Note: Existing NFR tests will be migrated into this testing `suite` and results stored in the `results` directory.
## Prerequisites @@ -24,13 +27,13 @@ Directory structure is as follows: - Docker. - Golang. -If running the tests on a VM (`make create-vm-and-run-tests` or `make run-tests-on-vm`): +If running NFR tests, or running functional tests in GKE: - The [gcloud CLI](https://cloud.google.com/sdk/docs/install) - A GKE cluster (if `master-authorized-networks` is enabled, please set `ADD_VM_IP_AUTH_NETWORKS=true` in your vars.env file) - Access to GCP Service Account with Kubernetes admin permissions -**Note**: all commands in steps below are executed from the `tests` directory +> Note: all commands in steps below are executed from the `tests` directory ```shell make @@ -52,9 +55,14 @@ delete-kind-cluster Delete kind cluster help Display this help load-images-with-plus Load NGF and NGINX Plus images on configured kind cluster load-images Load NGF and NGINX images on configured kind cluster -run-tests-on-vm Run the tests on a GCP VM -setup-gcp-and-run-tests Create and setup a GKE router and GCP VM for tests and run the tests -test Run the system tests against your default k8s cluster +nfr-test Run the NFR tests on a GCP VM +run-tests-on-vm Run the functional tests on a GCP VM +setup-gcp-and-run-nfr-tests Create and setup a GKE router and GCP VM for tests and run the NFR tests +setup-gcp-and-run-tests Create and setup a GKE router and GCP VM for tests and run the functional tests +start-longevity-test Start the longevity test to run for 4 days in GKE +stop-longevity-test Stops the longevity test and collects results +sync-files-to-vm Syncs your local NGF files with the NGF repo on the VM +test Runs the functional tests on your default k8s cluster ``` **Note:** The following variables are configurable when running the below `make` commands: @@ -78,6 +86,8 @@ test Run the system tests against your default k8s clu This can be done in a cloud provider of choice, or locally using `kind`. +**Important**: NFR tests can only be run on a GKE cluster. 
+ To create a local `kind` cluster: ```makefile @@ -128,7 +138,7 @@ make build-images-with-plus load-images-with-plus TAG=$(whoami) ## Step 3 - Run the tests -### 3a - Run the tests locally +### 3a - Run the functional tests locally ```makefile make test TAG=$(whoami) @@ -142,9 +152,9 @@ make test TAG=$(whoami) PLUS_ENABLED=true ### 3b - Run the tests on a GKE cluster from a GCP VM -This step only applies if you would like to run the tests on a GKE cluster from a GCP based VM. +This step only applies if you are running the NFR tests, or would like to run the functional tests on a GKE cluster from a GCP based VM. -Before running the below `make` command, copy the `scripts/vars.env-example` file to `scripts/vars.env` and populate the +Before running the below `make` commands, copy the `scripts/vars.env-example` file to `scripts/vars.env` and populate the required env vars. `GKE_SVC_ACCOUNT` needs to be the name of a service account that has Kubernetes admin permissions. In order to run the tests in GCP, you need a few things: @@ -153,30 +163,85 @@ In order to run the tests in GCP, you need a few things: - this assumes that your GKE cluster is using private nodes. If using public nodes, you don't need this. - GCP VM and firewall rule to send ingress traffic to GKE +To just set up the VM with no router (this will not run the tests): + +```makefile +make create-and-setup-vm +``` + +Otherwise, you can set up the VM, router, and run the tests with a single command. See the options in the sections below. + +By default, the tests run using the version of NGF that was `git cloned` during the setup. 
If you want to make +incremental changes and copy your local changes to the VM to test, you can run + +```makefile +make sync-files-to-vm +``` + +#### Functional Tests + To set up the GCP environment with the router and VM and then run the tests, run the following command: ```makefile make setup-gcp-and-run-tests ``` -If you just need a VM and no router (this will not run the tests): +To use an existing VM to run the tests, run the following ```makefile -make create-and-setup-vm +make run-tests-on-vm +``` + +#### NFR tests + +To set up the GCP environment with the router and VM and then run the tests, run the following command: + + +```makefile +make setup-gcp-and-run-nfr-tests ``` To use an existing VM to run the tests, run the following ```makefile -make run-tests-on-vm +make nfr-test +``` + +##### Longevity testing + +This test is run on its own (and also not in a pipeline) due to its long-running nature. It will run for 4 days before +the tester must collect the results and complete the test. + +To start the longevity test, set up your VM (`create-and-setup-vm`) and run + +```makefile +make start-longevity-test ``` + +> Note: If you want to change the time period for which the test runs, update the `wrk` commands in `suite/scripts/longevity-wrk.sh` to the time period you want, and run `make sync-files-to-vm`. + + +> Note: If you want to re-run the longevity test, you need to clear out the `cafe.example.com` entry from the `/etc/hosts` file on your VM. + +You can verify the test is working by checking nginx logs to see traffic flow, and check that the cronjob is running and redeploying apps. + +After 4 days (96h), you can complete the longevity tests and collect results. To ensure that the traffic has stopped flowing, you can ssh to the VM using `gcloud compute ssh` and run `ps aux | grep wrk` to verify the `wrk` commands are no longer running. 
Then, visit the [GCP Monitoring Dashboards](https://console.cloud.google.com/monitoring/dashboards) page and select the `NGF Longevity Test` dashboard. Take PNG screenshots of each chart for the time period in which your test ran, and save those to be added to the results file. + +Finally, run + +```makefile +make stop-longevity-test +``` + +This will tear down the test and collect results into a file, where you can add the PNGs of the dashboard. + ### Common test amendments -To run all tests with the label "performance", use the GINKGO_LABEL variable: +To run all tests with the label "my-label", use the GINKGO_LABEL variable: ```makefile -make test TAG=$(whoami) GINKGO_LABEL=performance +make test TAG=$(whoami) GINKGO_LABEL=my-label ``` or to pass a specific flag, e.g. run a specific test, use the GINKGO_FLAGS variable: @@ -185,6 +250,8 @@ or to pass a specific flag, e.g. run a specific test, use the GINKGO_FLAGS varia make test TAG=$(whoami) GINKGO_FLAGS='-ginkgo.focus "writes the system info to a results file"' ``` +> Note: if filtering on NFR tests (or functional tests on GKE), set the filter in the appropriate field in your `vars.env` file. + If you are running the tests in GCP, add your required label/ flags to `scripts/var.env`. You can also modify the tests code for a similar outcome. To run a specific test, you can "focus" it by adding the `F` diff --git a/tests/framework/results.go b/tests/framework/results.go index 5ea944563..429dd40e6 100644 --- a/tests/framework/results.go +++ b/tests/framework/results.go @@ -77,6 +77,15 @@ func WriteResults(resultsFile *os.File, metrics *Metrics) error { return reporter.Report(resultsFile) } +// WriteContent writes basic content to the results file. +func WriteContent(resultsFile *os.File, content string) error { + if _, err := fmt.Fprintln(resultsFile, content); err != nil { + return err + } + + return nil +} + // NewCSVEncoder returns a vegeta CSV encoder. 
func NewCSVEncoder(w io.Writer) vegeta.Encoder { return vegeta.NewCSVEncoder(w) diff --git a/tests/longevity/longevity.md b/tests/longevity/longevity.md deleted file mode 100644 index 1271678cc..000000000 --- a/tests/longevity/longevity.md +++ /dev/null @@ -1,151 +0,0 @@ -# Longevity Test - -This document describes how we test NGF for longevity. - - - -- [Longevity Test](#longevity-test) - - [Goals](#goals) - - [Test Environment](#test-environment) - - [Steps](#steps) - - [Start](#start) - - [Check the Test is Running Correctly](#check-the-test-is-running-correctly) - - [End](#end) - - [Analyze](#analyze) - - [Results](#results) - - - -## Goals - -- Ensure that NGF successfully processes both control plane and data plane transactions over a period of time much - greater than in our other tests. -- Catch bugs that could only appear over a period of time (like resource leaks). - -## Test Environment - -- A Kubernetes cluster with 3 nodes on GKE - - Node: e2-medium (2 vCPU, 4GB memory) - - Enabled GKE logging. - - Enabled GKE Cloud monitoring with managed Prometheus service, with enabled: - - system. - - kube state - pods, deployments. -- Tester VMs on Google Cloud: - - Configuration: - - Debian - - Install packages: tmux, wrk - - Location - same zone as the Kubernetes cluster. - - First VM - for HTTP traffic - - Second VM - for sending HTTPs traffic -- NGF - - Deployment with 1 replica - - Exposed via a Service with type LoadBalancer, private IP - - Gateway, two listeners - HTTP and HTTPs - - Two apps: - - Coffee - 3 replicas - - Tea - 3 replicas - - Two HTTPRoutes - - Coffee (HTTP) - - Tea (HTTPS) - -## Steps - -### Start - -Test duration - 4 days. - -1. Create a Kubernetes cluster on GKE. -2. Deploy NGF. -3. Expose NGF via a LoadBalancer Service with `"networking.gke.io/load-balancer-type":"Internal"` annotation to - allocate an internal load balancer. -4. Apply the manifests which will: - 1. Deploy the coffee and tea backends. - 2. 
Configure HTTP and HTTPS listeners on the Gateway. - 3. Expose coffee via HTTP listener and tea via HTTPS listener. - 4. Create two CronJobs to re-rollout backends: - 1. Coffee - every minute for an hour every 6 hours - 2. Tea - every minute for an hour every 6 hours, 3 hours apart from coffee. - 5. Configure Prometheus on GKE to pick up NGF metrics (NB: Ensure that the `app.kubernetes.io/name` label matches - your NGF deployment). - - ```shell - kubectl apply -f files - ``` - -5. In Tester VMs, update `/etc/hosts` to have an entry with the External IP of the NGF Service (`10.128.0.10` in this - case): - - ```text - 10.128.0.10 cafe.example.com - ``` - -6. In Tester VMs, start a tmux session (this is needed so that even if you disconnect from the VM, any launched command - will keep running): - - ```shell - tmux - ``` - -7. In First VM, start wrk for 4 days for coffee via HTTP: - - ```shell - wrk -t2 -c100 -d96h http://cafe.example.com/coffee - ``` - -8. In Second VM, start wrk for 4 days for tea via HTTPS: - - ```shell - wrk -t2 -c100 -d96h https://cafe.example.com/tea - ``` - -Notes: - -- The updated coffee and tea backends in cafe.yaml include extra configuration for zero time upgrades, so that - wrk in Tester VMs doesn't get 502 from NGF. Based on https://learnk8s.io/graceful-shutdown - -### Check the Test is Running Correctly - -Check that you don't see any errors: - -1. Check that GKE exports NGF pod logs to Google Cloud Operations Logging and Prometheus metrics to Google Cloud - Monitoring. -2. Check that traffic is flowing - look at the access logs of NGINX in Google Cloud Operations Logging. -3. Check that CronJob can run. - - ```shell - kubectl create job --from=cronjob/coffee-rollout-mgr coffee-test - kubectl create job --from=cronjob/tea-rollout-mgr tea-test - ``` - -In case of errors, double check if you prepared the environment and launched the test correctly. - -### End - -- Remove CronJobs. 
- -## Analyze - -- Traffic - - Tester VMs (clients) - - As wrk stop, they will print output upon termination. To connect to the tmux session with wrk, - run `tmux attach -t 0` - - Check for errors, latency, RPS -- Logs - - Check the logs for errors in Google Cloud Operations Logging. - - NGF - - NGINX -- Check metrics in Google Cloud Monitoring. - - NGF - - CPU usage - - NGINX - - NGF - - Memory usage - - NGINX - - NGF - - NGINX metrics - - Reloads - -## Results - -- [1.0.0](results/1.0.0/1.0.0.md) -- [1.1.0](results/1.1.0/1.1.0.md) diff --git a/tests/longevity/results/1.0.0/1.0.0.md b/tests/results/longevity/1.0.0/1.0.0.md similarity index 100% rename from tests/longevity/results/1.0.0/1.0.0.md rename to tests/results/longevity/1.0.0/1.0.0.md diff --git a/tests/longevity/results/1.0.0/cpu.png b/tests/results/longevity/1.0.0/cpu.png similarity index 100% rename from tests/longevity/results/1.0.0/cpu.png rename to tests/results/longevity/1.0.0/cpu.png diff --git a/tests/longevity/results/1.0.0/memory.png b/tests/results/longevity/1.0.0/memory.png similarity index 100% rename from tests/longevity/results/1.0.0/memory.png rename to tests/results/longevity/1.0.0/memory.png diff --git a/tests/longevity/results/1.0.0/reload-time.png b/tests/results/longevity/1.0.0/reload-time.png similarity index 100% rename from tests/longevity/results/1.0.0/reload-time.png rename to tests/results/longevity/1.0.0/reload-time.png diff --git a/tests/longevity/results/1.0.0/reloads.png b/tests/results/longevity/1.0.0/reloads.png similarity index 100% rename from tests/longevity/results/1.0.0/reloads.png rename to tests/results/longevity/1.0.0/reloads.png diff --git a/tests/longevity/results/1.0.0/stub-status.png b/tests/results/longevity/1.0.0/stub-status.png similarity index 100% rename from tests/longevity/results/1.0.0/stub-status.png rename to tests/results/longevity/1.0.0/stub-status.png diff --git a/tests/longevity/results/1.1.0/1.1.0.md b/tests/results/longevity/1.1.0/1.1.0.md 
similarity index 100% rename from tests/longevity/results/1.1.0/1.1.0.md rename to tests/results/longevity/1.1.0/1.1.0.md diff --git a/tests/longevity/results/1.1.0/cpu.png b/tests/results/longevity/1.1.0/cpu.png similarity index 100% rename from tests/longevity/results/1.1.0/cpu.png rename to tests/results/longevity/1.1.0/cpu.png diff --git a/tests/longevity/results/1.1.0/memory.png b/tests/results/longevity/1.1.0/memory.png similarity index 100% rename from tests/longevity/results/1.1.0/memory.png rename to tests/results/longevity/1.1.0/memory.png diff --git a/tests/longevity/results/1.1.0/reload-time.png b/tests/results/longevity/1.1.0/reload-time.png similarity index 100% rename from tests/longevity/results/1.1.0/reload-time.png rename to tests/results/longevity/1.1.0/reload-time.png diff --git a/tests/longevity/results/1.1.0/reloads.png b/tests/results/longevity/1.1.0/reloads.png similarity index 100% rename from tests/longevity/results/1.1.0/reloads.png rename to tests/results/longevity/1.1.0/reloads.png diff --git a/tests/longevity/results/1.1.0/stub-status.png b/tests/results/longevity/1.1.0/stub-status.png similarity index 100% rename from tests/longevity/results/1.1.0/stub-status.png rename to tests/results/longevity/1.1.0/stub-status.png diff --git a/tests/scripts/create-gke-cluster.sh b/tests/scripts/create-gke-cluster.sh index 20e7c08bc..9d034e1c6 100644 --- a/tests/scripts/create-gke-cluster.sh +++ b/tests/scripts/create-gke-cluster.sh @@ -14,7 +14,9 @@ gcloud container clusters create ${GKE_CLUSTER_NAME} \ --service-account ${GKE_NODES_SERVICE_ACCOUNT} \ --enable-private-nodes \ --master-ipv4-cidr 172.16.${ip_random_digit}.32/28 \ - --metadata=block-project-ssh-keys=TRUE + --metadata=block-project-ssh-keys=TRUE \ + --monitoring=SYSTEM,POD,DEPLOYMENT \ + --logging=SYSTEM,WORKLOAD # Add current IP to GKE master control node access, if this script is not invoked during a CI run. 
if [ "${IS_CI}" = "false" ]; then diff --git a/tests/scripts/remote-scripts/install-deps.sh b/tests/scripts/remote-scripts/install-deps.sh index 371f75ff6..1196a1f21 100644 --- a/tests/scripts/remote-scripts/install-deps.sh +++ b/tests/scripts/remote-scripts/install-deps.sh @@ -4,7 +4,7 @@ set -e source ~/vars.env -sudo apt-get -y update && sudo apt-get -y install git make kubectl google-cloud-sdk-gke-gcloud-auth-plugin jq gnuplot && \ +sudo apt-get -y update && sudo apt-get -y install git make kubectl google-cloud-sdk-gke-gcloud-auth-plugin jq gnuplot rsync wrk && \ curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash && \ export GO_VERSION=$(curl -sSL "https://golang.org/dl/?mode=json" | jq -r '.[0].version') && \ wget https://go.dev/dl/${GO_VERSION}.linux-amd64.tar.gz && \ diff --git a/tests/scripts/remote-scripts/run-nfr-tests.sh b/tests/scripts/remote-scripts/run-nfr-tests.sh new file mode 100644 index 000000000..10b4c1ea0 --- /dev/null +++ b/tests/scripts/remote-scripts/run-nfr-tests.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +set -e + +source ~/vars.env + +echo "export PATH=$PATH:/usr/local/go/bin" >> $HOME/.profile && . 
$HOME/.profile + +if [ "$START_LONGEVITY" == "true" ]; then + GINKGO_LABEL="longevity-setup" +elif [ "$STOP_LONGEVITY" == "true" ]; then + GINKGO_LABEL="longevity-teardown" +fi + +cd nginx-gateway-fabric/tests && make .vm-nfr-test TAG=${TAG} PREFIX=${PREFIX} NGINX_PREFIX=${NGINX_PREFIX} NGINX_PLUS_PREFIX=${NGINX_PLUS_PREFIX} PLUS_ENABLED=${PLUS_ENABLED} GINKGO_LABEL=${GINKGO_LABEL} GINKGO_FLAGS=${GINKGO_FLAGS} PULL_POLICY=Always GW_SERVICE_TYPE=LoadBalancer GW_SVC_GKE_INTERNAL=true NGF_VERSION=${NGF_VERSION} + +if [ "$START_LONGEVITY" == "true" ]; then + suite/scripts/longevity-wrk.sh +fi diff --git a/tests/scripts/run-tests-gcp-vm.sh b/tests/scripts/run-tests-gcp-vm.sh index 2a407bfa1..4c9a8478e 100644 --- a/tests/scripts/run-tests-gcp-vm.sh +++ b/tests/scripts/run-tests-gcp-vm.sh @@ -4,8 +4,42 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) source scripts/vars.env +SCRIPT=run-tests.sh +if [ "${NFR}" = "true" ]; then + SCRIPT=run-nfr-tests.sh +fi + gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} ${SCRIPT_DIR}/vars.env username@${RESOURCE_NAME}:~ -gcloud compute ssh --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} username@${RESOURCE_NAME} --command="bash -s" < ${SCRIPT_DIR}/remote-scripts/run-tests.sh +gcloud compute ssh --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} username@${RESOURCE_NAME} \ + --command="export START_LONGEVITY=${START_LONGEVITY} &&\ + export STOP_LONGEVITY=${STOP_LONGEVITY} &&\ + bash -s" < ${SCRIPT_DIR}/remote-scripts/${SCRIPT} + +if [ "${NFR}" = "true" ]; then + gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} --recurse username@${RESOURCE_NAME}:~/nginx-gateway-fabric/tests/results . 
+fi + +## If tearing down the longevity test, we need to collect logs from gcloud and add to the results +if [ "${STOP_LONGEVITY}" = "true" ]; then + version=${NGF_VERSION} + if [ "$version" = "" ]; then + version=${TAG} + fi + + results="${SCRIPT_DIR}/../results/longevity/$version/$version.md" + printf "\n## Error Logs\n\n" >> $results + + ## ngf error logs + ngfErrText=$(gcloud logging read --project=${GKE_PROJECT} 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx-gateway AND labels."k8s-pod/app_kubernetes_io/instance"=ngf-longevity AND severity=ERROR AND SEARCH("error")' --format "value(textPayload)") + ngfErrJSON=$(gcloud logging read --project=${GKE_PROJECT} 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx-gateway AND labels."k8s-pod/app_kubernetes_io/instance"=ngf-longevity AND severity=ERROR AND SEARCH("error")' --format "value(jsonPayload)") + printf "### nginx-gateway\n%s\n%s\n\n" "$ngfErrText" "$ngfErrJSON" >> $results + + ## nginx error logs + ngxErr=$(gcloud logging read --project=${GKE_PROJECT} 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx AND labels."k8s-pod/app_kubernetes_io/instance"=ngf-longevity AND severity=ERROR AND SEARCH("`[warn]`") OR SEARCH("`[error]`") OR SEARCH("`[emerg]`")' --format "value(textPayload)") + printf "### nginx\n%s\n\n" "$ngxErr" >> $results -gcloud compute scp --zone ${GKE_CLUSTER_ZONE} --project=${GKE_PROJECT} --recurse username@${RESOURCE_NAME}:~/nginx-gateway-fabric/tests/results .
+ ## nginx non-200 responses (also filter out 499 since wrk cancels connections) + ngxNon200=$(gcloud logging read --project=${GKE_PROJECT} 'resource.labels.cluster_name='"${RESOURCE_NAME}"' AND resource.type=k8s_container AND resource.labels.container_name=nginx AND labels."k8s-pod/app_kubernetes_io/instance"=ngf-longevity AND "GET" "HTTP/1.1" -"200" -"499" -"client prematurely closed connection"' --format "value(textPayload)") + printf "%s\n\n" "$ngxNon200" >> $results +fi diff --git a/tests/scripts/sync-files-to-vm.sh b/tests/scripts/sync-files-to-vm.sh new file mode 100755 index 000000000..c7862c205 --- /dev/null +++ b/tests/scripts/sync-files-to-vm.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +source scripts/vars.env + +NGF_DIR=$(dirname "$PWD") + +gcloud compute config-ssh --ssh-config-file ngf-gcp.ssh > /dev/null + +rsync -ave 'ssh -F ngf-gcp.ssh' ${NGF_DIR} username@${RESOURCE_NAME}.${GKE_CLUSTER_ZONE}.${GKE_PROJECT}:~ diff --git a/tests/suite/dataplane_perf_test.go b/tests/suite/dataplane_perf_test.go index 09f7a6748..9af85a1b2 100644 --- a/tests/suite/dataplane_perf_test.go +++ b/tests/suite/dataplane_perf_test.go @@ -17,7 +17,7 @@ import ( "github.com/nginxinc/nginx-gateway-fabric/tests/framework" ) -var _ = Describe("Dataplane performance", Ordered, Label("performance"), func() { +var _ = Describe("Dataplane performance", Ordered, Label("nfr", "performance"), func() { files := []string{ "dp-perf/coffee.yaml", "dp-perf/gateway.yaml", diff --git a/tests/suite/longevity_test.go b/tests/suite/longevity_test.go new file mode 100644 index 000000000..0f1382620 --- /dev/null +++ b/tests/suite/longevity_test.go @@ -0,0 +1,97 @@ +package suite + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + . "github.com/onsi/ginkgo/v2" + .
"github.com/onsi/gomega" + core "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/nginxinc/nginx-gateway-fabric/tests/framework" +) + +// Longevity test is an NFR test, but does not include the "nfr" label. It needs to run on its own, +// outside of the scope of the other NFR tests. This is because it's a long-term test whose environment +// shouldn't be torn down. +var _ = Describe("Longevity", Label("longevity-setup", "longevity-teardown"), func() { + var ( + files = []string{ + "longevity/cafe.yaml", + "longevity/cafe-secret.yaml", + "longevity/gateway.yaml", + "longevity/cafe-routes.yaml", + "longevity/cronjob.yaml", + } + promFile = []string{ + "longevity/prom.yaml", + } + + ns = &core.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "longevity", + }, + } + + labelFilter = GinkgoLabelFilter() + ) + + BeforeEach(func() { + if !strings.Contains(labelFilter, "longevity") { + Skip("skipping longevity test unless 'longevity' label is explicitly defined when running") + } + }) + + It("sets up the longevity test", Label("longevity-setup"), func() { + if !strings.Contains(labelFilter, "longevity-setup") { + Skip("'longevity-setup' label not specified; skipping...") + } + + Expect(resourceManager.Apply([]client.Object{ns})).To(Succeed()) + Expect(resourceManager.ApplyFromFiles(files, ns.Name)).To(Succeed()) + Expect(resourceManager.ApplyFromFiles(promFile, ngfNamespace)).To(Succeed()) + Expect(resourceManager.WaitForAppsToBeReady(ns.Name)).To(Succeed()) + }) + + It("collects results", Label("longevity-teardown"), func() { + if !strings.Contains(labelFilter, "longevity-teardown") { + Skip("'longevity-teardown' label not specified; skipping...") + } + + resultsDir, err := framework.CreateResultsDir("longevity", version) + Expect(err).ToNot(HaveOccurred()) + + filename := filepath.Join(resultsDir, fmt.Sprintf("%s.md", version)) + resultsFile, err := framework.CreateResultsFile(filename) + 
Expect(err).ToNot(HaveOccurred()) + defer resultsFile.Close() + + Expect(framework.WriteSystemInfoToFile(resultsFile, clusterInfo, *plusEnabled)).To(Succeed()) + + // gather wrk output + homeDir, err := os.UserHomeDir() + Expect(err).ToNot(HaveOccurred()) + + Expect(framework.WriteContent(resultsFile, "\n## Traffic\n")).To(Succeed()) + Expect(writeTrafficResults(resultsFile, homeDir, "coffee.txt", "HTTP")).To(Succeed()) + Expect(writeTrafficResults(resultsFile, homeDir, "tea.txt", "HTTPS")).To(Succeed()) + + Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed()) + Expect(resourceManager.Delete([]client.Object{ns})).To(Succeed()) + }) +}) + +func writeTrafficResults(resultsFile *os.File, homeDir, filename, testname string) error { + file := filepath.Join(homeDir, filename) + content, err := os.ReadFile(file) + if err != nil { + return err + } + + formattedContent := fmt.Sprintf("%s:\n\n```text\n%s```\n", testname, string(content)) + return framework.WriteContent(resultsFile, formattedContent) +} diff --git a/tests/longevity/manifests/cafe-routes.yaml b/tests/suite/manifests/longevity/cafe-routes.yaml similarity index 100% rename from tests/longevity/manifests/cafe-routes.yaml rename to tests/suite/manifests/longevity/cafe-routes.yaml diff --git a/tests/longevity/manifests/cafe-secret.yaml b/tests/suite/manifests/longevity/cafe-secret.yaml similarity index 100% rename from tests/longevity/manifests/cafe-secret.yaml rename to tests/suite/manifests/longevity/cafe-secret.yaml diff --git a/tests/longevity/manifests/cafe.yaml b/tests/suite/manifests/longevity/cafe.yaml similarity index 100% rename from tests/longevity/manifests/cafe.yaml rename to tests/suite/manifests/longevity/cafe.yaml diff --git a/tests/longevity/manifests/cronjob.yaml b/tests/suite/manifests/longevity/cronjob.yaml similarity index 86% rename from tests/longevity/manifests/cronjob.yaml rename to tests/suite/manifests/longevity/cronjob.yaml index 234ff903d..1f7511cf3 100644 ---
a/tests/longevity/manifests/cronjob.yaml +++ b/tests/suite/manifests/longevity/cronjob.yaml @@ -2,13 +2,11 @@ apiVersion: v1 kind: ServiceAccount metadata: name: rollout-mgr - namespace: default --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: rollout-mgr - namespace: default rules: - apiGroups: - "apps" @@ -21,7 +19,6 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: rollout-mgr - namespace: default roleRef: apiGroup: rbac.authorization.k8s.io kind: Role @@ -29,13 +26,11 @@ roleRef: subjects: - kind: ServiceAccount name: rollout-mgr - namespace: default --- apiVersion: batch/v1 kind: CronJob metadata: name: coffee-rollout-mgr - namespace: default spec: schedule: "* */6 * * *" # every minute every 6 hours jobTemplate: @@ -58,14 +53,13 @@ spec: -H "Authorization: Bearer $TOKEN" \ -H "Content-type: application/merge-patch+json" \ --data-raw "{\"spec\": {\"template\": {\"metadata\": {\"annotations\": {\"kubectl.kubernetes.io/restartedAt\": \"$RESTARTED_AT\"}}}}}" \ - "https://kubernetes/apis/apps/v1/namespaces/default/deployments/coffee?fieldManager=kubectl-rollout" 2>&1 + "https://kubernetes.default/apis/apps/v1/namespaces/default/deployments/coffee?fieldManager=kubectl-rollout" 2>&1 restartPolicy: OnFailure --- apiVersion: batch/v1 kind: CronJob metadata: name: tea-rollout-mgr - namespace: default spec: schedule: "* 3,9,15,21 * * *" # every minute every 6 hours, 3 hours apart from coffee jobTemplate: @@ -88,5 +82,5 @@ spec: -H "Authorization: Bearer $TOKEN" \ -H "Content-type: application/merge-patch+json" \ --data-raw "{\"spec\": {\"template\": {\"metadata\": {\"annotations\": {\"kubectl.kubernetes.io/restartedAt\": \"$RESTARTED_AT\"}}}}}" \ - "https://kubernetes/apis/apps/v1/namespaces/default/deployments/tea?fieldManager=kubectl-rollout" 2>&1 + "https://kubernetes.default/apis/apps/v1/namespaces/default/deployments/tea?fieldManager=kubectl-rollout" 2>&1 restartPolicy: OnFailure diff --git 
a/tests/longevity/manifests/gateway.yaml b/tests/suite/manifests/longevity/gateway.yaml similarity index 100% rename from tests/longevity/manifests/gateway.yaml rename to tests/suite/manifests/longevity/gateway.yaml diff --git a/tests/longevity/manifests/prom.yaml b/tests/suite/manifests/longevity/prom.yaml similarity index 79% rename from tests/longevity/manifests/prom.yaml rename to tests/suite/manifests/longevity/prom.yaml index e5d35fae7..24de26577 100644 --- a/tests/longevity/manifests/prom.yaml +++ b/tests/suite/manifests/longevity/prom.yaml @@ -6,7 +6,7 @@ metadata: spec: selector: matchLabels: - app.kubernetes.io/name: nginx-gateway + app.kubernetes.io/name: nginx-gateway-fabric endpoints: - port: metrics interval: 30s diff --git a/tests/suite/sample_test.go b/tests/suite/sample_test.go index 0e6ce59f1..3996c6764 100644 --- a/tests/suite/sample_test.go +++ b/tests/suite/sample_test.go @@ -14,7 +14,7 @@ import ( "github.com/nginxinc/nginx-gateway-fabric/tests/framework" ) -var _ = Describe("Basic test example", func() { +var _ = Describe("Basic test example", Label("functional"), func() { files := []string{ "hello/hello.yaml", "hello/gateway.yaml", diff --git a/tests/suite/scripts/longevity-wrk.sh b/tests/suite/scripts/longevity-wrk.sh new file mode 100755 index 000000000..58312a1e1 --- /dev/null +++ b/tests/suite/scripts/longevity-wrk.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +SVC_IP=$(kubectl -n nginx-gateway get svc ngf-longevity-nginx-gateway-fabric -o jsonpath='{.status.loadBalancer.ingress[0].ip}') + +echo "${SVC_IP} cafe.example.com" | sudo tee -a /etc/hosts + +nohup wrk -t2 -c100 -d96h http://cafe.example.com/coffee &> ~/coffee.txt & + +nohup wrk -t2 -c100 -d96h https://cafe.example.com/tea &> ~/tea.txt & diff --git a/tests/suite/system_suite_test.go b/tests/suite/system_suite_test.go index 8d2af38b5..f758bd98b 100644 --- a/tests/suite/system_suite_test.go +++ b/tests/suite/system_suite_test.go @@ -68,6 +68,7 @@ var ( address string version string 
clusterInfo framework.ClusterInfo + skipNFRTests bool ) const ( @@ -76,9 +77,11 @@ const ( ) type setupConfig struct { + releaseName string chartPath string gwAPIVersion string deploy bool + nfr bool } func setup(cfg setupConfig, extraInstallArgs ...string) { @@ -110,12 +113,30 @@ func setup(cfg setupConfig, extraInstallArgs ...string) { clusterInfo, err = resourceManager.GetClusterInfo() Expect(err).ToNot(HaveOccurred()) + if cfg.nfr && !clusterInfo.IsGKE { + skipNFRTests = true + Skip("NFR tests can only run in GKE") + } + + if cfg.nfr && *serviceType != "LoadBalancer" { + skipNFRTests = true + Skip("GW_SERVICE_TYPE must be 'LoadBalancer' for NFR tests") + } + + if *versionUnderTest != "" { + version = *versionUnderTest + } else if *imageTag != "" { + version = *imageTag + } else { + version = "edge" + } + if !cfg.deploy { return } installCfg := framework.InstallationConfig{ - ReleaseName: releaseName, + ReleaseName: cfg.releaseName, Namespace: ngfNamespace, ChartPath: cfg.chartPath, ServiceType: *serviceType, @@ -131,14 +152,6 @@ func setup(cfg setupConfig, extraInstallArgs ...string) { installCfg.ImagePullPolicy = *imagePullPolicy } - if *versionUnderTest != "" { - version = *versionUnderTest - } else if *imageTag != "" { - version = *imageTag - } else { - version = "edge" - } - output, err := framework.InstallGatewayAPI(k8sClient, cfg.gwAPIVersion, *k8sVersion) Expect(err).ToNot(HaveOccurred(), string(output)) @@ -163,13 +176,13 @@ func setup(cfg setupConfig, extraInstallArgs ...string) { Expect(err).ToNot(HaveOccurred()) } -func teardown() { +func teardown(relName string) { if portFwdPort != 0 { portForwardStopCh <- struct{}{} } cfg := framework.InstallationConfig{ - ReleaseName: releaseName, + ReleaseName: relName, Namespace: ngfNamespace, } @@ -204,21 +217,50 @@ var _ = BeforeSuite(func() { localChartPath = filepath.Join(basepath, "deploy/helm-chart") cfg := setupConfig{ + releaseName: releaseName, chartPath: localChartPath, gwAPIVersion: 
*gatewayAPIVersion, deploy: true, } - // If we are running the upgrade test only, then skip the initial deployment. - // The upgrade test will deploy its own version of NGF. - suiteConfig, _ := GinkgoConfiguration() - if suiteConfig.LabelFilter == "upgrade" { + labelFilter := GinkgoLabelFilter() + cfg.nfr = isNFR(labelFilter) + + // Skip deployment if: + // - running upgrade test (this test will deploy its own version) + // - running longevity teardown (deployment will already exist) + if strings.Contains(labelFilter, "upgrade") || strings.Contains(labelFilter, "longevity-teardown") { cfg.deploy = false } + // use a different release name for longevity to allow us to filter on a specific label when collecting + // logs from GKE + if strings.Contains(labelFilter, "longevity") { + cfg.releaseName = "ngf-longevity" + } + setup(cfg) }) var _ = AfterSuite(func() { - teardown() + if skipNFRTests { + Skip("") + } + + labelFilter := GinkgoLabelFilter() + if !strings.Contains(labelFilter, "longevity-setup") { + relName := releaseName + if strings.Contains(labelFilter, "longevity-teardown") { + relName = "ngf-longevity" + } + + teardown(relName) + } }) + +func isNFR(labelFilter string) bool { + return strings.Contains(labelFilter, "nfr") || + strings.Contains(labelFilter, "longevity") || + strings.Contains(labelFilter, "performance") || + strings.Contains(labelFilter, "upgrade") +} diff --git a/tests/suite/upgrade_test.go b/tests/suite/upgrade_test.go index 3fa71bcc6..0e5983401 100644 --- a/tests/suite/upgrade_test.go +++ b/tests/suite/upgrade_test.go @@ -26,7 +26,7 @@ import ( // This test installs the latest released version of NGF, then upgrades to the edge version (or dev version). // During the upgrade, traffic is continuously sent to ensure no downtime. // We also check that the leader election lease has been updated, and that Gateway updates are processed. 
-var _ = Describe("Upgrade testing", Label("upgrade"), func() { +var _ = Describe("Upgrade testing", Label("nfr", "upgrade"), func() { var ( files = []string{ "ngf-upgrade/cafe.yaml", @@ -44,23 +44,12 @@ var _ = Describe("Upgrade testing", Label("upgrade"), func() { valuesFile = "manifests/ngf-upgrade/values.yaml" resultsFile *os.File resultsDir string - skipped bool ) BeforeEach(func() { - if !clusterInfo.IsGKE { - skipped = true - Skip("Upgrade tests can only run in GKE") - } - - if *serviceType != "LoadBalancer" { - skipped = true - Skip("GW_SERVICE_TYPE must be 'LoadBalancer' for upgrade tests") - } - // this test is unique in that it needs to install the previous version of NGF, // so we need to uninstall the version installed at the suite level, then install the custom version - teardown() + teardown(releaseName) cfg := setupConfig{ chartPath: "oci://ghcr.io/nginxinc/charts/nginx-gateway-fabric", @@ -84,10 +73,6 @@ var _ = Describe("Upgrade testing", Label("upgrade"), func() { }) AfterEach(func() { - if skipped { - Skip("") - } - Expect(resourceManager.DeleteFromFiles(files, ns.Name)).To(Succeed()) Expect(resourceManager.Delete([]client.Object{ns})).To(Succeed()) resultsFile.Close()