diff --git a/.gitignore b/.gitignore index 3524906ad..7dd2bdfe0 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ /*.tar.gz ci.env pr.env +junit*.xml diff --git a/Makefile b/Makefile index 646c09a40..e5ff59a8d 100644 --- a/Makefile +++ b/Makefile @@ -75,7 +75,6 @@ ifneq ($(BUILD_TAGS), "") BUILD_TAGS:=-tags "$(BUILD_TAGS)" endif - vet: GO111MODULE=on go list -mod vendor $(BUILD_TAGS) ./... | \ grep -v "./vendor/*" | \ @@ -107,7 +106,16 @@ Dockerfile: Dockerfile.in sed -e 's|@BASEIMAGE@|$(BASEIMAGE)|g' $< >$@ test: vet fmt - GO111MODULE=on go test -mod vendor -timeout=1m -v -race $(BUILD_TAGS) ./... + GO111MODULE=on go test -mod vendor -timeout=1m -v -race -short $(BUILD_TAGS) ./... + +e2e-test: vet fmt build-tar + GO111MODULE=on go test -mod vendor -timeout=10m -v $(BUILD_TAGS) \ + ./test/e2e/metriconly/... \ + -project=$(PROJECT) -zone=$(ZONE) \ + -image=$(VM_IMAGE) -image-project=$(IMAGE_PROJECT) \ + -ssh-user=$(SSH_USER) -ssh-key=$(SSH_KEY) \ + -npd-build-tar=`pwd`/$(TARBALL) \ + -artifacts-dir=$(ARTIFACTS) build-binaries: ./bin/node-problem-detector ./bin/log-counter @@ -115,7 +123,7 @@ build-container: build-binaries Dockerfile docker build -t $(IMAGE) . build-tar: ./bin/node-problem-detector ./bin/log-counter - tar -zcvf $(TARBALL) bin/ config/ + tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh sha1sum $(TARBALL) md5sum $(TARBALL) diff --git a/config/systemd/node-problem-detector-metric-only.service b/config/systemd/node-problem-detector-metric-only.service new file mode 100644 index 000000000..142fee176 --- /dev/null +++ b/config/systemd/node-problem-detector-metric-only.service @@ -0,0 +1,15 @@ +[Unit] +Description=Node problem detector +Wants=local-fs.target +After=local-fs.target + +[Service] +Restart=always +RestartSec=10 +ExecStart=/home/kubernetes/bin/node-problem-detector --v=2 --logtostderr --enable-k8s-exporter=false \ + --config.system-log-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor.json,/home/kubernetes/node-problem-detector/config/docker-monitor.json,/home/kubernetes/node-problem-detector/config/systemd-monitor.json \ + --config.custom-plugin-monitor=/home/kubernetes/node-problem-detector/config/kernel-monitor-counter.json,/home/kubernetes/node-problem-detector/config/systemd-monitor-counter.json \ + --config.system-stats-monitor=/home/kubernetes/node-problem-detector/config/system-stats-monitor.json + +[Install] +WantedBy=multi-user.target diff --git a/pkg/util/metrics/fakes.go b/pkg/util/metrics/fakes.go index b98420303..53b3beea5 100644 --- a/pkg/util/metrics/fakes.go +++ b/pkg/util/metrics/fakes.go @@ -21,17 +21,6 @@ import ( "reflect" ) -// Int64MetricRepresentation represents a snapshot of an int64 metrics. -// This is used for inspecting fake metrics. -type Int64MetricRepresentation struct { - // Name is the metric name. - Name string - // Labels contains all metric labels in key-value pair format. - Labels map[string]string - // Value is the value of the metric. - Value int64 -} - // Int64MetricInterface is used to create test double for Int64Metric. type Int64MetricInterface interface { // Record records a measurement for the metric, with provided tags as metric labels. diff --git a/pkg/util/metrics/helpers.go b/pkg/util/metrics/helpers.go index 14e6f5d64..3cc6952ff 100644 --- a/pkg/util/metrics/helpers.go +++ b/pkg/util/metrics/helpers.go @@ -18,8 +18,11 @@ package metrics import ( "context" "fmt" + "strings" "sync" + pcm "github.com/prometheus/client_model/go" + "github.com/prometheus/common/expfmt" "go.opencensus.io/stats" "go.opencensus.io/stats/view" "go.opencensus.io/tag" @@ -34,12 +37,6 @@ func init() { tagMapMutex.Unlock() } -// Int64Metric represents an int64 metric. -type Int64Metric struct { - name string - measure *stats.Int64Measure -} - // Aggregation defines how measurements should be aggregated into data points. type Aggregation string @@ -50,6 +47,23 @@ const ( Sum Aggregation = "Sum" ) +// Int64MetricRepresentation represents a snapshot of an int64 metrics. +// This is used for inspecting metric internals. +type Int64MetricRepresentation struct { + // Name is the metric name. + Name string + // Labels contains all metric labels in key-value pair format. + Labels map[string]string + // Value is the value of the metric. + Value int64 +} + +// Int64Metric represents an int64 metric. +type Int64Metric struct { + name string + measure *stats.Int64Measure +} + // NewInt64Metric create a Int64Metric metric, returns nil when name is empty. func NewInt64Metric(name string, description string, unit string, aggregation Aggregation, tagNames []string) (*Int64Metric, error) { if name == "" { @@ -106,6 +120,17 @@ func (metric *Int64Metric) Record(tags map[string]string, measurement int64) err metric.measure.M(measurement)) } +// Float64MetricRepresentation represents a snapshot of a float64 metrics. +// This is used for inspecting metric internals. +type Float64MetricRepresentation struct { + // Name is the metric name. + Name string + // Labels contains all metric labels in key-value pair format. + Labels map[string]string + // Value is the value of the metric. + Value float64 +} + // Float64Metric represents an float64 metric. type Float64Metric struct { name string @@ -187,3 +212,66 @@ func getTagKeysFromNames(tagNames []string) ([]tag.Key, error) { } return tagKeys, nil } + +// ParsePrometheusMetrics parses Prometheus formatted metrics into metrics under Float64MetricRepresentation. +// +// Note: Prometheus's go library stores all counter/gauge-typed metric values under float64. +func ParsePrometheusMetrics(metricsText string) ([]Float64MetricRepresentation, error) { + var metrics []Float64MetricRepresentation + + var textParser expfmt.TextParser + metricFamilies, err := textParser.TextToMetricFamilies(strings.NewReader(metricsText)) + if err != nil { + return metrics, err + } + + for _, metricFamily := range metricFamilies { + for _, metric := range metricFamily.Metric { + labels := make(map[string]string) + for _, labelPair := range metric.Label { + labels[*labelPair.Name] = *labelPair.Value + } + + var value float64 + if *metricFamily.Type == pcm.MetricType_COUNTER { + value = *metric.Counter.Value + } else if *metricFamily.Type == pcm.MetricType_GAUGE { + value = *metric.Gauge.Value + } else { + return metrics, fmt.Errorf("unexpected MetricType %s for metric %s", + pcm.MetricType_name[int32(*metricFamily.Type)], *metricFamily.Name) + } + + metrics = append(metrics, Float64MetricRepresentation{*metricFamily.Name, labels, value}) + } + } + + return metrics, nil +} + +// GetFloat64Metric finds the metric matching provided name and labels. +// When strictLabelMatching is set to true, the founded metric labels are identical to the provided labels; +// when strictLabelMatching is set to false, the founded metric labels are a superset of the provided labels. +func GetFloat64Metric(metrics []Float64MetricRepresentation, name string, labels map[string]string, + strictLabelMatching bool) (Float64MetricRepresentation, error) { + for _, metric := range metrics { + if metric.Name != name { + continue + } + if strictLabelMatching && len(metric.Labels) != len(labels) { + continue + } + sameLabels := true + for key, value := range labels { + if metric.Labels[key] != value { + sameLabels = false + break + } + } + if !sameLabels { + continue + } + return metric, nil + } + return Float64MetricRepresentation{}, fmt.Errorf("no matching metric found") +} diff --git a/pkg/util/metrics/helpers_test.go b/pkg/util/metrics/helpers_test.go new file mode 100644 index 000000000..80625cbea --- /dev/null +++ b/pkg/util/metrics/helpers_test.go @@ -0,0 +1,152 @@ +/* +Copyright 2019 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +package metrics + +import ( + "io/ioutil" + "testing" +) + +// TestPrometheusMetricsParsingAndMatching verifies the behavior of ParsePrometheusMetrics() and GetFloat64Metric(). +func TestPrometheusMetricsParsingAndMatching(t *testing.T) { + testCases := []struct { + name string + metricsTextPath string + expectedMetrics []Float64MetricRepresentation + notExpectedMetrics []Float64MetricRepresentation + strictLabelMatching bool + }{ + { + name: "Relaxed label matching", + metricsTextPath: "testdata/sample_metrics.txt", + expectedMetrics: []Float64MetricRepresentation{ + // Metric with no label. + { + Name: "host_uptime", + Labels: map[string]string{}, + }, + // Metric with partial label. + { + Name: "host_uptime", + Labels: map[string]string{"kernel_version": "4.14.127+"}, + }, + { + Name: "disk_avg_queue_len", + Labels: map[string]string{"device": "sda1"}, + }, + { + Name: "disk_avg_queue_len", + Labels: map[string]string{"device": "sda8"}, + }, + }, + notExpectedMetrics: []Float64MetricRepresentation{ + // Metric with non-existant label. + { + Name: "host_uptime", + Labels: map[string]string{"non-existant-version": "0.0.1"}, + }, + // Metric with incorrect label. + { + Name: "host_uptime", + Labels: map[string]string{"kernel_version": "mismatched-version"}, + }, + // Non-exsistant metric. + { + Name: "host_downtime", + Labels: map[string]string{}, + }, + }, + strictLabelMatching: false, + }, + { + name: "Strict label matching", + metricsTextPath: "testdata/sample_metrics.txt", + expectedMetrics: []Float64MetricRepresentation{ + { + Name: "host_uptime", + Labels: map[string]string{"kernel_version": "4.14.127+", "os_version": "cos 73-11647.217.0"}, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "DockerHung"}, + }, + { + Name: "problem_counter", + Labels: map[string]string{"reason": "OOMKilling"}, + }, + }, + notExpectedMetrics: []Float64MetricRepresentation{ + // Metric with incomplete label. + { + Name: "host_uptime", + Labels: map[string]string{"kernel_version": "4.14.127+"}, + }, + // Metric with missing label. + { + Name: "host_uptime", + Labels: map[string]string{}, + }, + // Metric with non-existant label. + { + Name: "host_uptime", + Labels: map[string]string{"non-existant-version": "0.0.1"}, + }, + // Metric with incorrect label. + { + Name: "host_uptime", + Labels: map[string]string{"kernel_version": "mismatched-version"}, + }, + // Non-exsistant metric. + { + Name: "host_downtime", + Labels: map[string]string{}, + }, + }, + strictLabelMatching: true, + }, + } + + for _, test := range testCases { + t.Run(test.name, func(t *testing.T) { + b, err := ioutil.ReadFile(test.metricsTextPath) + if err != nil { + t.Errorf("Unexpected error reading file %s: %v", test.metricsTextPath, err) + } + metricsText := string(b) + + metrics, err := ParsePrometheusMetrics(metricsText) + if err != nil { + t.Errorf("Unexpected error parsing NPD metrics: %v\nMetrics text: %s\n", err, metricsText) + } + + for _, expectedMetric := range test.expectedMetrics { + _, err = GetFloat64Metric(metrics, expectedMetric.Name, expectedMetric.Labels, test.strictLabelMatching) + if err != nil { + t.Errorf("Failed to find metric %v in these metrics %v.\nMetrics text: %s\n", + expectedMetric, metrics, metricsText) + } + } + + for _, notExpectedMetric := range test.notExpectedMetrics { + _, err = GetFloat64Metric(metrics, notExpectedMetric.Name, notExpectedMetric.Labels, test.strictLabelMatching) + if err == nil { + t.Errorf("Unexpected metric %v found in these metrics %v.\nMetrics text: %s\n", + notExpectedMetric, metrics, metricsText) + } + } + }) + } +} diff --git a/pkg/util/metrics/testdata/sample_metrics.txt b/pkg/util/metrics/testdata/sample_metrics.txt new file mode 100644 index 000000000..6fc6dd286 --- /dev/null +++ b/pkg/util/metrics/testdata/sample_metrics.txt @@ -0,0 +1,45 @@ +# HELP disk_avg_queue_len The average queue length on the disk +# TYPE disk_avg_queue_len gauge +disk_avg_queue_len{device="sda"} 3.388908266480642 +disk_avg_queue_len{device="sda1"} 6.53953488372093 +disk_avg_queue_len{device="sda8"} 3.404255319148936 +# HELP disk_io_time The IO time spent on the disk +# TYPE disk_io_time gauge +disk_io_time{device="sda"} 8601 +disk_io_time{device="sda1"} 430 +disk_io_time{device="sda8"} 47 +# HELP disk_weighted_io The weighted IO on the disk +# TYPE disk_weighted_io gauge +disk_weighted_io{device="sda"} 29148 +disk_weighted_io{device="sda1"} 2812 +disk_weighted_io{device="sda8"} 160 +# HELP host_uptime The uptime of the operating system +# TYPE host_uptime gauge +host_uptime{kernel_version="4.14.127+",os_version="cos 73-11647.217.0"} 81 +# HELP problem_counter Number of times a specific type of problem have occurred. +# TYPE problem_counter counter +problem_counter{reason="AUFSUmountHung"} 0 +problem_counter{reason="ContainerdStart"} 1 +problem_counter{reason="CorruptDockerImage"} 0 +problem_counter{reason="CorruptDockerOverlay2"} 0 +problem_counter{reason="DockerHung"} 0 +problem_counter{reason="DockerStart"} 1 +problem_counter{reason="FilesystemIsReadOnly"} 0 +problem_counter{reason="FrequentContainerdRestart"} 0 +problem_counter{reason="FrequentDockerRestart"} 0 +problem_counter{reason="FrequentKubeletRestart"} 0 +problem_counter{reason="KernelOops"} 0 +problem_counter{reason="KubeletStart"} 0 +problem_counter{reason="OOMKilling"} 0 +problem_counter{reason="TaskHung"} 0 +problem_counter{reason="UnregisterNetDevice"} 0 +# HELP problem_gauge Whether a specific type of problem is affecting the node or not. +# TYPE problem_gauge gauge +problem_gauge{reason="AUFSUmountHung",type="KernelDeadlock"} 0 +problem_gauge{reason="CorruptDockerOverlay2",type="CorruptDockerOverlay2"} 0 +problem_gauge{reason="DockerHung",type="KernelDeadlock"} 0 +problem_gauge{reason="FilesystemIsReadOnly",type="ReadonlyFilesystem"} 0 +problem_gauge{reason="FrequentContainerdRestart",type="FrequentContainerdRestart"} 0 +problem_gauge{reason="FrequentDockerRestart",type="FrequentDockerRestart"} 0 +problem_gauge{reason="FrequentKubeletRestart",type="FrequentKubeletRestart"} 0 +problem_gauge{reason="UnregisterNetDevice",type="FrequentUnregisterNetDevice"} 0 diff --git a/test/e2e-install.sh b/test/e2e-install.sh new file mode 100755 index 000000000..ee5d29a4f --- /dev/null +++ b/test/e2e-install.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash + +# Copyright 2019 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script is for installing node problem detector (NPD) on a running node +# in metric-only mode, as a setup for NPD e2e tests. + +set -o errexit +set -o nounset +set -o pipefail + +readonly BIN_DIR=/home/kubernetes/bin +readonly CONFIG_DIR=/home/kubernetes/node-problem-detector/config + +function print-help() { + echo "Usage: e2e-install.sh [flags] [command]" + echo + echo "Available flags:" + echo " -t [TARBALL] Specify the path of the NPD tarball (generated by 'make build-tar')." + echo + echo "Available commands:" + echo " help Print this help message" + echo " install Installs NPD to the this machine" + echo + echo "Examples:" + echo " e2e-install.sh help" + echo " e2e-install.sh -t /tmp/npd.tar.gz install" +} + +function install-npd() { + if [[ -z "${TARBALL}" ]]; then + echo "ERROR: tarball flag is missing." + exit 1 + fi + + readonly workdir=$(mktemp -d) + tar -xf "${TARBALL}" --directory "${workdir}" + + echo "Preparing NPD binary directory." + mkdir -p "${BIN_DIR}" + mount --bind "${BIN_DIR}" "${BIN_DIR}" + # Below remount is to work around COS's noexec mount on /home. + mount -o remount,exec "${BIN_DIR}" + + echo "Installing NPD binary." + cp "${workdir}"/bin/node-problem-detector "${BIN_DIR}" + + echo "Installing log-counter binary." + cp "${workdir}"/bin/log-counter "${BIN_DIR}" + + echo "Installing NPD configurations." + mkdir -p "${CONFIG_DIR}" + cp -r "${workdir}"/config/* "${CONFIG_DIR}" + + echo "Installing NPD systemd service." + cp "${workdir}"/config/systemd/node-problem-detector-metric-only.service /etc/systemd/system/node-problem-detector.service + + rm -rf "${workdir}" + + # Start systemd service. + echo "Starting NPD systemd service." + systemctl daemon-reload + systemctl stop node-problem-detector.service || true + systemctl start node-problem-detector.service +} + +function main() { + case ${1:-} in + help) print-help;; + install) install-npd;; + *) print-help;; + esac +} + +TARBALL="" + +while getopts "t:" opt; do + case ${opt} in + t) TARBALL="${OPTARG}";; + esac +done +shift "$((OPTIND-1))" + + +main "${@}" \ No newline at end of file diff --git a/test/e2e/README.md b/test/e2e/README.md new file mode 100644 index 000000000..2c9e23dab --- /dev/null +++ b/test/e2e/README.md @@ -0,0 +1,25 @@ +# Node Problem Detector End-To-End tests + +NPD e2e tests are meant for testing the NPD on a VM environment. + +Currently the tests only support Google Compute Engine (GCE) environment. Support for other vendors can be added in future. + +## Prerequisites + +1. Setup [Google Application Default Credentials](https://developers.google.com/identity/protocols/application-default-credentials), which is [required for authentication](https://godoc.org/google.golang.org/api/compute/v1#hdr-Creating_a_client) by the Compute Engine API. +2. Setup a [project-wide SSH key](https://cloud.google.com/compute/docs/instances/adding-removing-ssh-keys#project-wide) that can be used to SSH into the GCE VMs. + +## Running tests + +From the node-problem-detector base directory, run: + +``` +export GOOGLE_APPLICATION_CREDENTIALS=[YOUR_ADC_PATH:~/.config/gcloud/application_default_credentials.json] +export ZONE=[ANY_GCE_ZONE:us-central1-a] +export PROJECT=[YOUR_PROJECT_ID] +export VM_IMAGE=[TESTED_OS_IMAGE:cos-73-11647-217-0] +export IMAGE_PROJECT=[TESTED_OS_IMAGE_PROJECT:cos-cloud] +export SSH_USER=${USER} +export SSH_KEY=~/.ssh/id_rsa +make e2e-test +``` diff --git a/test/e2e/lib/gce/gce.go b/test/e2e/lib/gce/gce.go new file mode 100644 index 000000000..7c1a99c4e --- /dev/null +++ b/test/e2e/lib/gce/gce.go @@ -0,0 +1,55 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gce + +import ( + "net/http" + "time" + + "golang.org/x/oauth2" + "golang.org/x/oauth2/google" + compute "google.golang.org/api/compute/v1" +) + +// GetComputeClient creates a GCE client with a 1 minute deadline. +func GetComputeClient() (*compute.Service, error) { + const retries = 10 + const backoff = time.Second * 6 + + // Setup the gce client for provisioning instances + // Getting credentials on gce jenkins is flaky, so try a couple times + var err error + var cs *compute.Service + for i := 0; i < retries; i++ { + if i > 0 { + time.Sleep(backoff) + } + + var client *http.Client + client, err = google.DefaultClient(oauth2.NoContext, compute.ComputeScope) + if err != nil { + continue + } + + cs, err = compute.New(client) + if err != nil { + continue + } + return cs, nil + } + return nil, err +} diff --git a/test/e2e/lib/gce/instance.go b/test/e2e/lib/gce/instance.go new file mode 100644 index 000000000..6aa4a4662 --- /dev/null +++ b/test/e2e/lib/gce/instance.go @@ -0,0 +1,166 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package gce + +import ( + "fmt" + "os/exec" + "time" + + "k8s.io/node-problem-detector/test/e2e/lib/ssh" + + compute "google.golang.org/api/compute/v1" +) + +// Instance represents a GCE instance. +type Instance struct { + Name string + Zone string + Project string + MachineType string + ExternalIP string + SshKey string + SshUser string + ComputeService *compute.Service +} + +// CreateInstance creates a GCE instance with provided spec. +func CreateInstance(instance Instance, imageName string, imageProject string) (Instance, error) { + if instance.MachineType == "" { + instance.MachineType = "n1-standard-1" + } + + p, err := instance.ComputeService.Projects.Get(instance.Project).Do() + if err != nil { + return instance, fmt.Errorf("failed to get project info %q", instance.Project) + } + + i := &compute.Instance{ + Name: instance.Name, + MachineType: fmt.Sprintf("zones/%s/machineTypes/%s", instance.Zone, instance.MachineType), + NetworkInterfaces: []*compute.NetworkInterface{ + { + AccessConfigs: []*compute.AccessConfig{ + { + Type: "ONE_TO_ONE_NAT", + Name: "External NAT", + }, + }}, + }, + Disks: []*compute.AttachedDisk{ + { + AutoDelete: true, + Boot: true, + Type: "PERSISTENT", + InitializeParams: &compute.AttachedDiskInitializeParams{ + SourceImage: fmt.Sprintf("projects/%s/global/images/%s", imageProject, imageName), + DiskSizeGb: 20, + }, + }, + }, + ServiceAccounts: []*compute.ServiceAccount{ + { + Email: p.DefaultServiceAccount, + Scopes: []string{ + "https://www.googleapis.com/auth/cloud-platform", + }, + }, + }, + } + + if _, err := instance.ComputeService.Instances.Get(instance.Project, instance.Zone, instance.Name).Do(); err != nil { + op, err := instance.ComputeService.Instances.Insert(instance.Project, instance.Zone, i).Do() + if err != nil { + ret := fmt.Sprintf("could not create instance %s: API error: %v", instance.Name, err) + if op != nil { + ret = fmt.Sprintf("%s: %v", ret, op.Error) + } + return instance, fmt.Errorf(ret) + } else if op.Error != nil { + return instance, fmt.Errorf("could not create instance %s: %+v", instance.Name, op.Error) + } + } + + instanceRunning := false + for i := 0; i < 30 && !instanceRunning; i++ { + if i > 0 { + time.Sleep(time.Second * 20) + } + if instance.ExternalIP == "" { + instance.populateExternalIP() + } + + result := ssh.Run("pwd", instance.ExternalIP, instance.SshUser, instance.SshKey) + if result.SSHError != nil { + err = fmt.Errorf("SSH to instance %s failed: %s", instance.Name, result.SSHError) + continue + } + instanceRunning = true + } + + if !instanceRunning { + return instance, err + } + return instance, nil +} + +func (ins *Instance) populateExternalIP() { + gceInstance, err := ins.ComputeService.Instances.Get(ins.Project, ins.Zone, ins.Name).Do() + if err != nil { + return + } + + for i := range gceInstance.NetworkInterfaces { + ni := gceInstance.NetworkInterfaces[i] + for j := range ni.AccessConfigs { + ac := ni.AccessConfigs[j] + if len(ac.NatIP) > 0 { + ins.ExternalIP = ac.NatIP + return + } + } + } +} + +// RunCommand runs a command on the GCE instance and returns the command result. +func (ins *Instance) RunCommand(cmd string) ssh.Result { + if ins.ExternalIP == "" { + ins.populateExternalIP() + } + return ssh.Run(cmd, ins.ExternalIP, ins.SshUser, ins.SshKey) +} + +// PushFile pushs a local file to a GCE instance. +func (ins *Instance) PushFile(srcPath, destPath string) error { + if ins.ExternalIP == "" { + ins.populateExternalIP() + } + return exec.Command("scp", "-o", "StrictHostKeyChecking no", + "-i", ins.SshKey, + srcPath, fmt.Sprintf("%s@%s:%s", ins.SshUser, ins.ExternalIP, destPath)).Run() +} + +// DeleteInstance deletes a GCE instance. +func (ins *Instance) DeleteInstance() error { + if _, err := ins.ComputeService.Instances.Get(ins.Project, ins.Zone, ins.Name).Do(); err != nil { + return err + } + if _, err := ins.ComputeService.Instances.Delete(ins.Project, ins.Zone, ins.Name).Do(); err != nil { + return err + } + return nil +} diff --git a/test/e2e/lib/npd/npd.go b/test/e2e/lib/npd/npd.go new file mode 100644 index 000000000..9b5ea0d56 --- /dev/null +++ b/test/e2e/lib/npd/npd.go @@ -0,0 +1,145 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package npd + +import ( + "fmt" + "io/ioutil" + "path" + "strings" + "time" + + "k8s.io/node-problem-detector/pkg/util/metrics" + "k8s.io/node-problem-detector/test/e2e/lib/gce" + + "github.com/avast/retry-go" +) + +const npdMetricsFilename = "npd_metrics.txt" +const npdLogsFilename = "npd_logs.txt" + +// SetupNPD installs NPD from the test tarball onto the provided GCE instance. +// +// Here is how it works: +// 1. SetupNPD will SCP the NPD build tarball onto the VM. +// 2. SetupNPD will extract the tarball in the VM, to expose the test/e2e-install.sh on the VM. +// 3. SetupNPD will then call the e2e-install.sh script, and feed the NPD build tarball as input. +// 4. Finally, the e2e-install.sh script will do the heavy lifting of installing NPD (setting up +// binary/config directories, setting up systemd config file, etc). +func SetupNPD(ins gce.Instance, npdBuildTar string) error { + tmpDirCmd := ins.RunCommand("mktemp -d") + if tmpDirCmd.SSHError != nil || tmpDirCmd.Code != 0 { + return fmt.Errorf("error creating temporary directory to hold NPD tarball: %v", tmpDirCmd) + } + + tmpDir := strings.TrimSuffix(tmpDirCmd.Stdout, "\n") + npdTarVMPath := tmpDir + "/npd.tar.gz" + npdExtractDir := tmpDir + "/npd" + + err := ins.PushFile(npdBuildTar, npdTarVMPath) + if err != nil { + return fmt.Errorf("error pushing local NPD build tarball %s to VM at %s: %v", npdBuildTar, npdTarVMPath, err) + } + + mkdirCmd := ins.RunCommand(fmt.Sprintf("mkdir -p %s", npdExtractDir)) + if mkdirCmd.SSHError != nil || mkdirCmd.Code != 0 { + return fmt.Errorf("error creating directory to extract NPD tarball into: %v", mkdirCmd) + } + + extractCmd := ins.RunCommand(fmt.Sprintf("tar -xf %s --directory %s", npdTarVMPath, npdExtractDir)) + if extractCmd.SSHError != nil || extractCmd.Code != 0 { + return fmt.Errorf("error extracting NPD build tarball: %v", extractCmd) + } + + installCmd := ins.RunCommand(fmt.Sprintf("sudo bash %s/test/e2e-install.sh -t %s install", npdExtractDir, npdTarVMPath)) + if installCmd.SSHError != nil || installCmd.Code != 0 { + return fmt.Errorf("error installing NPD: %v", installCmd) + } + + return nil +} + +// FetchNPDMetrics fetches and parses metrics reported by NPD on the provided GCE instance. +func FetchNPDMetrics(ins gce.Instance) ([]metrics.Float64MetricRepresentation, error) { + var npdMetrics []metrics.Float64MetricRepresentation + var err error + + curlCmd := ins.RunCommand("curl http://localhost:20257/metrics") + if curlCmd.SSHError != nil || curlCmd.Code != 0 { + return npdMetrics, fmt.Errorf("error fetching NPD metrics: %v", curlCmd) + } + + npdMetrics, err = metrics.ParsePrometheusMetrics(curlCmd.Stdout) + if err != nil { + return npdMetrics, fmt.Errorf("error parsing NPD metrics: %v", err) + } + + return npdMetrics, nil +} + +// WaitForNPD waits for NPD to become ready by waiting for expected metrics. +func WaitForNPD(ins gce.Instance, metricNames []string, timeoutSeconds uint) error { + verifyMetricExist := func() error { + gotMetrics, err := FetchNPDMetrics(ins) + if err != nil { + return fmt.Errorf("Error fetching NPD metrics: %v", err) + } + for _, metricName := range metricNames { + _, err = metrics.GetFloat64Metric(gotMetrics, metricName, map[string]string{}, false) + if err != nil { + return fmt.Errorf("Failed to find metric %s: %v.\nHere is all NPD exported metrics: %v", + metricName, err, gotMetrics) + } + } + return nil + } + + // Wait for NPD to be ready for a maximum of 120 seconds. + return retry.Do(verifyMetricExist, + retry.Delay(10*time.Second), + retry.Attempts(timeoutSeconds/10), + retry.DelayType(retry.FixedDelay)) +} + +// SaveTestArtifacts saves debugging data from NPD. +func SaveTestArtifacts(ins gce.Instance, directory string) []error { + var errs []error + + npdMetrics := ins.RunCommand("curl http://localhost:20257/metrics") + if npdMetrics.SSHError != nil || npdMetrics.Code != 0 { + errs = append(errs, fmt.Errorf("Error fetching NPD metrics: %v\n", npdMetrics)) + } else { + npdMetricsPath := path.Join(directory, npdMetricsFilename) + err := ioutil.WriteFile(npdMetricsPath, []byte(npdMetrics.Stdout), 0644) + if err != nil { + errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdMetricsPath, err)) + } + } + + npdLog := ins.RunCommand("sudo journalctl -u node-problem-detector.service") + if npdLog.SSHError != nil || npdLog.Code != 0 { + errs = append(errs, fmt.Errorf("Error fetching NPD logs: %v\n", npdLog)) + } else { + npdLogsPath := path.Join(directory, npdLogsFilename) + err := ioutil.WriteFile(npdLogsPath, []byte(npdLog.Stdout), 0644) + if err != nil { + errs = append(errs, fmt.Errorf("Error writing to %s: %v", npdLogsPath, err)) + } + } + + return errs +} diff --git a/test/e2e/lib/ssh/ssh.go b/test/e2e/lib/ssh/ssh.go new file mode 100644 index 000000000..579e6ecf6 --- /dev/null +++ b/test/e2e/lib/ssh/ssh.go @@ -0,0 +1,64 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ssh + +import ( + "fmt" + "net" + "os" + + k8s_ssh "k8s.io/kubernetes/pkg/ssh" +) + +// Result holds the execution result of SSH command. +type Result struct { + User string + Host string + Cmd string + Stdout string + Stderr string + Code int + SSHError error +} + +// Run synchronously SSHs to a machine and runs cmd. +func Run(cmd, ip, sshUser, sshKey string) Result { + result := Result{User: sshUser, Host: ip, Cmd: cmd} + + if result.User == "" { + result.User = os.Getenv("USER") + } + + if ip == "" { + result.SSHError = fmt.Errorf("empty IP address") + return result + } + + signer, err := k8s_ssh.MakePrivateKeySignerFromFile(sshKey) + if err != nil { + result.SSHError = fmt.Errorf("error getting signer from key file %s: %v", sshKey, err) + return result + } + + stdout, stderr, code, err := k8s_ssh.RunSSHCommand(cmd, result.User, net.JoinHostPort(ip, "22"), signer) + result.Stdout = stdout + result.Stderr = stderr + result.Code = code + result.SSHError = err + + return result +} diff --git a/test/e2e/metriconly/e2e_npd_test.go b/test/e2e/metriconly/e2e_npd_test.go new file mode 100644 index 000000000..d747dd42d --- /dev/null +++ b/test/e2e/metriconly/e2e_npd_test.go @@ -0,0 +1,73 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e_metric_only + +import ( + "flag" + "fmt" + "os" + "path" + "testing" + + "k8s.io/node-problem-detector/test/e2e/lib/gce" + + "github.com/onsi/ginkgo" + "github.com/onsi/ginkgo/reporters" + compute "google.golang.org/api/compute/v1" +) + +const junitFileName = "junit.xml" + +var zone = flag.String("zone", "", "gce zone the hosts live in") +var project = flag.String("project", "", "gce project the hosts live in") +var image = flag.String("image", "", "image to test") +var imageProject = flag.String("image-project", "", "gce project of the OS image") +var sshKey = flag.String("ssh-key", "", "path to ssh private key.") +var sshUser = flag.String("ssh-user", "", "use predefined user for ssh.") +var npdBuildTar = flag.String("npd-build-tar", "", "tarball containing NPD to be tested.") +var artifactsDir = flag.String("artifacts-dir", "", "local directory to save test artifacts into.") + +var computeService *compute.Service + +func TestNPD(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + + if *artifactsDir != "" { + _, err := os.Stat(*artifactsDir) + if err != nil && os.IsNotExist(err) { + os.MkdirAll(*artifactsDir, os.ModeDir|0755) + } + } + + // The junit formatted result output is for showing test results on testgrid. + junitReporter := reporters.NewJUnitReporter(path.Join(*artifactsDir, junitFileName)) + ginkgo.RunSpecsWithDefaultAndCustomReporters(t, "NPD Metric-only Suite", []ginkgo.Reporter{junitReporter}) +} + +func TestMain(m *testing.M) { + flag.Parse() + + var err error + computeService, err = gce.GetComputeClient() + if err != nil { + panic(fmt.Sprintf("Unable to create gcloud compute service using defaults. Make sure you are authenticated. %v", err)) + } + + os.Exit(m.Run()) +} diff --git a/test/e2e/metriconly/metrics_test.go b/test/e2e/metriconly/metrics_test.go new file mode 100644 index 000000000..c0ddfbdb7 --- /dev/null +++ b/test/e2e/metriconly/metrics_test.go @@ -0,0 +1,108 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e_metric_only + +import ( + "fmt" + "os" + "path" + "strings" + + "k8s.io/node-problem-detector/pkg/util/metrics" + "k8s.io/node-problem-detector/test/e2e/lib/gce" + "k8s.io/node-problem-detector/test/e2e/lib/npd" + + "github.com/onsi/ginkgo" + "github.com/pborman/uuid" +) + +var _ = ginkgo.Describe("NPD should export Prometheus metrics.", func() { + var instance gce.Instance + + ginkgo.BeforeEach(func() { + var err error + // TODO(xueweiz): Creating instance for each test case is slow. We should either reuse the instance + // between tests, or have a way to run these tests in parallel. + instance, err = gce.CreateInstance( + gce.Instance{ + Name: "npd-metrics-" + *image + "-" + uuid.NewUUID().String()[:8], + Zone: *zone, + Project: *project, + SshKey: *sshKey, + SshUser: *sshUser, + ComputeService: computeService, + }, + *image, + *imageProject) + if err != nil { + ginkgo.Fail(fmt.Sprintf("Unable to create test instance: %v", err)) + } + + err = npd.SetupNPD(instance, *npdBuildTar) + if err != nil { + ginkgo.Fail(fmt.Sprintf("Unable to setup NPD: %v", err)) + } + }) + + ginkgo.Context("On a clean node", func() { + + ginkgo.It("NPD should export host_uptime metric", func() { + err := npd.WaitForNPD(instance, []string{"host_uptime"}, 120) + if err != nil { + ginkgo.Fail(fmt.Sprintf("Expect NPD to become ready in 120s, but hit error: %v", err)) + } + + gotMetrics, err := npd.FetchNPDMetrics(instance) + if err != nil { + ginkgo.Fail(fmt.Sprintf("Error fetching NPD metrics: %v", err)) + } + _, err = metrics.GetFloat64Metric(gotMetrics, "host_uptime", map[string]string{}, false) + if err != nil { + ginkgo.Fail(fmt.Sprintf("Failed to find uptime metric: %v.\nHere is all NPD exported metrics: %v", + err, gotMetrics)) + } + }) + }) + + ginkgo.AfterEach(func() { + defer func() { + err := instance.DeleteInstance() + if err != nil { + ginkgo.Fail(fmt.Sprintf("Failed to clean up the test VM: %v", err)) + } + }() + + artifactSubDir := "" + if *artifactsDir != "" { + testText := ginkgo.CurrentGinkgoTestDescription().FullTestText + testSubdirName := strings.Replace(testText, " ", "_", -1) + + artifactSubDir = path.Join(*artifactsDir, testSubdirName) + err := os.MkdirAll(artifactSubDir, os.ModeDir|0644) + if err != nil { + fmt.Printf("Failed to create sub-directory to hold test artiface for test %s at %s\n", + testText, artifactSubDir) + return + } + } + + errs := npd.SaveTestArtifacts(instance, artifactSubDir) + if len(errs) != 0 { + fmt.Printf("Error storing debugging data to test artifacts: %v", errs) + } + }) +})