From 9d3db604ce2288344977c7c9e83292854ef913ea Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 24 Sep 2020 12:39:46 +0200 Subject: [PATCH 1/2] deploy: clarify OPERATOR_NAME "should" is not strong enough. It "must" match. --- deploy/kustomize/operator/operator.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deploy/kustomize/operator/operator.yaml b/deploy/kustomize/operator/operator.yaml index 323fb13e88..435fce085b 100644 --- a/deploy/kustomize/operator/operator.yaml +++ b/deploy/kustomize/operator/operator.yaml @@ -148,7 +148,8 @@ spec: fieldRef: fieldPath: metadata.name - name: OPERATOR_NAME - # This should match to the operator container name + # This must match to the operator container name. + # The operator uses it to find its image. value: "pmem-csi-operator" volumeMounts: - name: tmp From 48a0250d34f524ca57e6acc3618d6db559e10600 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 24 Sep 2020 13:14:31 +0200 Subject: [PATCH 2/2] test: catch unexpected modifications by operator The operator had a bug where its change detection caused it to update objects unnecessarily. This wasn't caught before by the tests because the modified objects still had the expected content (as far as we know, at least - https://github.com/intel/pmem-csi/issues/742 still lacks a proper explanation). Now the update unit test catches that bug: TestDeploymentController/Kubernetes_1.18/updating/pmemPercentage_in_default_deployment: deployment_controller_test.go:264: Error Trace: deployment_controller_test.go:264 deployment_controller_test.go:559 deployment_controller_test.go:584 Error: Received unexpected error: deployed driver different from expected deployment: object was modified unnecessarily: "pmem-csi-with-defaults-node" of type "apps/v1, Kind=DaemonSet" in namespace "test-namespace" object was modified unnecessarily: "pmem-csi-with-defaults-controller" of type "apps/v1, Kind=StatefulSet" in namespace "test-namespace" Test: TestDeploymentController/Kubernetes_1.18/updating/pmemPercentage_in_default_deployment Messages: validate deployment We cannot use the same validation during E2E testing because the app controllers also modify the objects by setting their status. --- .../deployment/deployment_controller_test.go | 71 ++++++++--------- test/e2e/operator/deployment_api.go | 39 +++++++--- test/e2e/operator/driver.go | 2 +- test/e2e/operator/validate/validate.go | 77 +++++++++++++++---- 4 files changed, 127 insertions(+), 62 deletions(-) diff --git a/pkg/pmem-csi-operator/controller/deployment/deployment_controller_test.go b/pkg/pmem-csi-operator/controller/deployment/deployment_controller_test.go index 702f09e4c0..f87b5cc8f8 100644 --- a/pkg/pmem-csi-operator/controller/deployment/deployment_controller_test.go +++ b/pkg/pmem-csi-operator/controller/deployment/deployment_controller_test.go @@ -185,11 +185,12 @@ func TestDeploymentController(t *testing.T) { testIt := func(t *testing.T, testK8sVersion version.Version) { type testContext struct { - c client.Client - cs kubernetes.Interface - rc reconcile.Reconciler - evWatcher watch.Interface - events []*corev1.Event + c client.Client + cs kubernetes.Interface + rc reconcile.Reconciler + evWatcher watch.Interface + events []*corev1.Event + resourceVersions map[string]string } const ( testNamespace = "test-namespace" @@ -209,9 +210,11 @@ func TestDeploymentController(t *testing.T) { } setup := func(t *testing.T) *testContext { - tc := &testContext{} - tc.c = fake.NewFakeClient() - tc.cs = cgfake.NewSimpleClientset() + tc := &testContext{ + c: fake.NewFakeClient(), + cs: cgfake.NewSimpleClientset(), + resourceVersions: map[string]string{}, + } tc.rc = newReconcileDeployment(tc.c, tc.cs) tc.evWatcher = tc.rc.(*deployment.ReconcileDeployment).EventBroadcaster().StartEventWatcher(func(ev *corev1.Event) { // Discard consecutive duplicate events, mimicking the EventAggregator behavior @@ -245,14 +248,20 @@ func TestDeploymentController(t *testing.T) { require.ElementsMatch(t, events, expectedEvents, "events must match") } - validateDriver := func(t *testing.T, tc *testContext, dep *api.Deployment, expectedEvents []string) { + validateDriver := func(t *testing.T, tc *testContext, dep *api.Deployment, expectedEvents []string, wasUpdated bool) { // We may have to fill in some defaults, so make a copy first. dep = dep.DeepCopyObject().(*api.Deployment) if dep.Spec.Image == "" { dep.Spec.Image = testDriverImage } - require.NoError(t, validate.DriverDeployment(tc.c, testK8sVersion, testNamespace, *dep), "validate deployment") + // If the CR was not updated, then objects should still be the same as they were initially. + rv := tc.resourceVersions + if wasUpdated { + rv = nil + } + _, err := validate.DriverDeployment(tc.c, testK8sVersion, testNamespace, *dep, rv) + require.NoError(t, err, "validate deployment") validateEvents(t, tc, dep, expectedEvents) } @@ -271,7 +280,7 @@ func TestDeploymentController(t *testing.T) { require.NoError(t, err, "failed to create deployment") testReconcilePhase(t, tc.rc, tc.c, d.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, false) }) t.Run("deployment with explicit values", func(t *testing.T) { @@ -297,7 +306,7 @@ func TestDeploymentController(t *testing.T) { // Reconcile now should change Phase to running testReconcilePhase(t, tc.rc, tc.c, d.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, false) }) t.Run("multiple deployments", func(t *testing.T) { @@ -320,9 +329,9 @@ func TestDeploymentController(t *testing.T) { require.NoError(t, err, "failed to create deployment2") testReconcilePhase(t, tc.rc, tc.c, d1.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep1, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep1, []string{api.EventReasonNew, api.EventReasonRunning}, false) testReconcilePhase(t, tc.rc, tc.c, d2.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep2, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep2, []string{api.EventReasonNew, api.EventReasonRunning}, false) }) t.Run("invalid device mode", func(t *testing.T) { @@ -354,7 +363,7 @@ func TestDeploymentController(t *testing.T) { err := tc.c.Create(context.TODO(), dep) require.NoError(t, err, "failed to create deployment") testReconcilePhase(t, tc.rc, tc.c, d.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, false) }) t.Run("direct mode", func(t *testing.T) { @@ -369,7 +378,7 @@ func TestDeploymentController(t *testing.T) { err := tc.c.Create(context.TODO(), dep) require.NoError(t, err, "failed to create deployment") testReconcilePhase(t, tc.rc, tc.c, d.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, false) }) t.Run("provided private keys", func(t *testing.T) { @@ -391,7 +400,7 @@ func TestDeploymentController(t *testing.T) { // First deployment expected to be successful testReconcilePhase(t, tc.rc, tc.c, d.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, false) }) t.Run("provided private keys and certificates", func(t *testing.T) { @@ -424,7 +433,7 @@ func TestDeploymentController(t *testing.T) { // First deployment expected to be successful testReconcilePhase(t, tc.rc, tc.c, d.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, false) }) t.Run("invalid private keys and certificates", func(t *testing.T) { @@ -517,13 +526,13 @@ func TestDeploymentController(t *testing.T) { tc.rc.(*deployment.ReconcileDeployment).AddHook(&hook) testReconcilePhase(t, tc.rc, tc.c, d.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, false) tc.rc.(*deployment.ReconcileDeployment).RemoveHook(&hook) // Next reconcile phase should catch the deployment changes testReconcilePhase(t, tc.rc, tc.c, d.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, updatedDep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, updatedDep, []string{api.EventReasonNew, api.EventReasonRunning}, true) }) t.Run("updating", func(t *testing.T) { @@ -543,10 +552,11 @@ func TestDeploymentController(t *testing.T) { require.NoError(t, err, "create deployment") testReconcilePhase(t, tc.rc, tc.c, dep.Name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, false) // Reconcile now should keep phase as running. testReconcilePhase(t, tc.rc, tc.c, dep.Name, false, false, api.DeploymentPhaseRunning) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, false) validateEvents(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) // Retrieve existing object before updating it. @@ -567,7 +577,7 @@ func TestDeploymentController(t *testing.T) { testReconcilePhase(t, tc.rc, tc.c, dep.Name, false, false, api.DeploymentPhaseRunning) // Recheck the container resources are updated - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, true) } t.Run("while running", func(t *testing.T) { @@ -593,7 +603,7 @@ func TestDeploymentController(t *testing.T) { err := tc.c.Create(context.TODO(), dep) require.NoError(t, err, "failed to create deployment") testReconcilePhase(t, tc.rc, tc.c, d.name, false, false, api.DeploymentPhaseRunning) - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, false) err = tc.c.Get(context.TODO(), client.ObjectKey{Name: d.name}, dep) require.NoError(t, err, "get deployment") @@ -629,17 +639,6 @@ func TestDeploymentController(t *testing.T) { err = tc.c.Create(context.TODO(), cm2) require.NoError(t, err, "create configmap: %s", cm2.Name) - defer func() { - err := tc.c.Delete(context.TODO(), cm1) - if err != nil && !errors.IsNotFound(err) { - require.NoErrorf(t, err, "delete configmap: %s", cm1.Name) - } - err = tc.c.Delete(context.TODO(), cm2) - if err != nil && !errors.IsNotFound(err) { - require.NoErrorf(t, err, "delete configmap: %s", cm2.Name) - } - }() - // Use a fresh reconciler to mimic operator restart tc.rc = newReconcileDeployment(tc.c, tc.cs) @@ -647,7 +646,9 @@ func TestDeploymentController(t *testing.T) { testReconcilePhase(t, tc.rc, tc.c, d.name, false, false, api.DeploymentPhaseRunning) err = tc.c.Get(context.TODO(), client.ObjectKey{Name: d.name}, dep) require.NoError(t, err, "get deployment") - validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}) + // It is debatable whether the operator should update all objects after + // a restart. Currently it does. + validateDriver(t, tc, dep, []string{api.EventReasonNew, api.EventReasonRunning}, true) cm := &corev1.ConfigMap{} err = tc.c.Get(context.TODO(), client.ObjectKey{Name: cm1.Name, Namespace: testNamespace}, cm) diff --git a/test/e2e/operator/deployment_api.go b/test/e2e/operator/deployment_api.go index 8c0d55fd92..a916055d55 100644 --- a/test/e2e/operator/deployment_api.go +++ b/test/e2e/operator/deployment_api.go @@ -95,7 +95,15 @@ var _ = deploy.DescribeForSome("API", func(d *deploy.Deployment) bool { if what == nil { what = []interface{}{"validate driver"} } - framework.ExpectNoErrorWithOffset(1, validate.DriverDeploymentEventually(ctx, client, k8sver, d.Namespace, deployment), what...) + + // We cannot check for unexpected object modifications + // by the operator during E2E testing because the app + // controllers themselves will also modify the same + // objects with status changes. We can only test + // that during unit testing. + initialCreation := false + + framework.ExpectNoErrorWithOffset(1, validate.DriverDeploymentEventually(ctx, client, k8sver, d.Namespace, deployment, initialCreation), what...) framework.Logf("got expected driver deployment %s", deployment.Name) } @@ -197,7 +205,7 @@ var _ = deploy.DescribeForSome("API", func(d *deploy.Deployment) bool { deployment2 = deploy.CreateDeploymentCR(f, deployment2) defer deploy.DeleteDeploymentCR(f, deployment2.Name) - validateDriver(deployment1 /* TODO 2 */, "validate driver #2") + validateDriver(deployment1, true /* TODO 2 */, "validate driver #2") }) It("shall support dots in the name", func() { @@ -214,7 +222,7 @@ var _ = deploy.DescribeForSome("API", func(d *deploy.Deployment) bool { deployment = deploy.CreateDeploymentCR(f, deployment) defer deploy.DeleteDeploymentCR(f, deployment.Name) - validateDriver(deployment) + validateDriver(deployment, true) }) It("driver deployment shall be running even after operator exit", func() { @@ -223,7 +231,7 @@ var _ = deploy.DescribeForSome("API", func(d *deploy.Deployment) bool { deployment = deploy.CreateDeploymentCR(f, deployment) defer deploy.DeleteDeploymentCR(f, deployment.Name) - validateDriver(deployment) + validateDriver(deployment, true) // Stop the operator stopOperator(c, d) @@ -231,8 +239,13 @@ var _ = deploy.DescribeForSome("API", func(d *deploy.Deployment) bool { defer startOperator(c, d) // Ensure that the driver is running consistently + resourceVersions := map[string]string{} Consistently(func() error { - return validate.DriverDeployment(client, k8sver, d.Namespace, deployment) + final, err := validate.DriverDeployment(client, k8sver, d.Namespace, deployment, resourceVersions) + if final { + framework.Failf("final error during driver validation after restarting: %v", err) + } + return err }, "1m", "20s").ShouldNot(HaveOccurred(), "driver validation failure after restarting") }) @@ -283,7 +296,7 @@ var _ = deploy.DescribeForSome("API", func(d *deploy.Deployment) bool { // Deleting the existing secret should make the deployment succeed. deleteSecret(sec.Name) - validateDriver(deployment) + validateDriver(deployment, true) }) }) @@ -378,7 +391,7 @@ var _ = deploy.DescribeForSome("API", func(d *deploy.Deployment) bool { deployment = deploy.CreateDeploymentCR(f, deployment) defer deploy.DeleteDeploymentCR(f, deployment.Name) deploy.WaitForPMEMDriver(c, deployment.Name, corev1.NamespaceDefault, false /* testing */) - validateDriver(deployment) + validateDriver(deployment, true) // NOTE(avalluri): As the current operator does not support deploying // the driver in 'testing' mode, we cannot directely access CSI @@ -434,8 +447,10 @@ var _ = deploy.DescribeForSome("API", func(d *deploy.Deployment) bool { validateDriver(deployment, "validate driver before update") // We have to get a fresh copy before updating it because the - // operator should have modified the status. - deployment = deploy.GetDeploymentCR(f, deployment.Name) + // operator should have modified the status, and only the status. + modifiedDeployment := deploy.GetDeploymentCR(f, deployment.Name) + Expect(modifiedDeployment.Spec).To(Equal(deployment.Spec), "spec unmodified") + Expect(modifiedDeployment.Status.Phase).To(Equal(api.DeploymentPhaseRunning), "deployment phase") restored := false if restart { @@ -447,15 +462,15 @@ var _ = deploy.DescribeForSome("API", func(d *deploy.Deployment) bool { }() } - testcase.Mutate(&deployment) - deployment = deploy.UpdateDeploymentCR(f, deployment) + testcase.Mutate(&modifiedDeployment) + deployment = deploy.UpdateDeploymentCR(f, modifiedDeployment) if restart { startOperator(c, d) restored = true } - validateDriver(deployment, "validate driver after update and restart") + validateDriver(modifiedDeployment, "validate driver after update and restart") } It("while running", func() { diff --git a/test/e2e/operator/driver.go b/test/e2e/operator/driver.go index 1d6077c9cb..76399bfc4e 100644 --- a/test/e2e/operator/driver.go +++ b/test/e2e/operator/driver.go @@ -49,7 +49,7 @@ var _ = deploy.DescribeForSome("driver", func(d *deploy.Deployment) bool { deployment := d.GetDriverDeployment() deployment = deploy.GetDeploymentCR(f, deployment.Name) - framework.ExpectNoError(validate.DriverDeploymentEventually(ctx, client, *k8sver, d.Namespace, deployment), "validate driver") + framework.ExpectNoError(validate.DriverDeploymentEventually(ctx, client, *k8sver, d.Namespace, deployment, true), "validate driver") }) // Just very minimal testing at the moment. diff --git a/test/e2e/operator/validate/validate.go b/test/e2e/operator/validate/validate.go index 4442fce6c2..9b929fe575 100644 --- a/test/e2e/operator/validate/validate.go +++ b/test/e2e/operator/validate/validate.go @@ -36,7 +36,7 @@ import ( // objects for a certain deployment spec. deploymentSpec should only have those fields // set which are not the defaults. This call will wait for the expected objects until // the context times out. -func DriverDeploymentEventually(ctx context.Context, client client.Client, k8sver version.Version, namespace string, deployment api.Deployment) error { +func DriverDeploymentEventually(ctx context.Context, client client.Client, k8sver version.Version, namespace string, deployment api.Deployment, initialCreation bool) error { ticker := time.NewTicker(1 * time.Second) defer ticker.Stop() @@ -44,37 +44,71 @@ func DriverDeploymentEventually(ctx context.Context, client client.Client, k8sve return errors.New("deployment not an object that was stored in the API server, no UID") } + // Track resource versions to detect modifications? + var resourceVersions map[string]string + if initialCreation { + resourceVersions = map[string]string{} + } + // TODO: once there is a better way to detect that the // operator has finished updating the deployment, check for // that here instead of repeatedly checking the objects. // As it stands now, permanent differences will only be // reported when the test times out. - ready := func() (err error) { - return DriverDeployment(client, k8sver, namespace, deployment) + ready := func() (final bool, err error) { + return DriverDeployment(client, k8sver, namespace, deployment, resourceVersions) } - if err := ready(); err != nil { + if final, err := ready(); err != nil { + if final { + return err + } + loop: for { select { case <-ticker.C: - err = ready() + final, err = ready() if err == nil { - return nil + break loop + } + if final { + return err } case <-ctx.Done(): return fmt.Errorf("timed out waiting for deployment, last error: %v", err) } } } - return nil + + // Now wait a bit longer to see whether the objects change again - they shouldn't. + // The longer we wait, the more certainty we have that we have reached a stable state. + deadline, cancel := context.WithTimeout(ctx, 10*time.Second) + defer cancel() + for { + select { + case <-ticker.C: + if _, err := ready(); err != nil { + return fmt.Errorf("objects were changed after reaching expected state: %v", err) + } + case <-deadline.Done(): + return nil + } + } } // DriverDeployment compares all objects as deployed by the operator against the expected // objects for a certain deployment spec. deploymentSpec should only have those fields // set which are not the defaults. The caller must ensure that the operator is done // with creating objects. -func DriverDeployment(client client.Client, k8sver version.Version, namespace string, deployment api.Deployment) error { +// +// resourceVersions is used to track which resource versions were encountered +// for generated objects. If not nil, the version must not change (i.e. the operator +// must not update the objects after creating them). +// +// A final error is returned when observing a problem that is not going to go away, +// like an unexpected update of an object. +func DriverDeployment(client client.Client, k8sver version.Version, namespace string, deployment api.Deployment, resourceVersions map[string]string) (final bool, finalErr error) { if deployment.GetUID() == "" { - return errors.New("deployment not an object that was stored in the API server, no UID") + return true, errors.New("deployment not an object that was stored in the API server, no UID") } // The operator currently always uses the production image. We @@ -85,16 +119,32 @@ func DriverDeployment(client client.Client, k8sver version.Version, namespace st // Validate sub-objects. A sub-object is anything that has the deployment object as owner. objects, err := listAllDeployedObjects(client, deployment) if err != nil { - return err + return false, err } expectedObjects, err := deployments.LoadAndCustomizeObjects(k8sver, deployment.Spec.DeviceMode, namespace, deployment) if err != nil { - return fmt.Errorf("customize expected objects: %v", err) + return true, fmt.Errorf("customize expected objects: %v", err) } var diffs []string for _, actual := range objects { + // When the CR just got created, the operator should + // immediately create objects with the right content and then + // not update them again. + if resourceVersions != nil { + uid := string(actual.GetUID()) + currentResourceVersion := actual.GetResourceVersion() + oldResourceVersion, ok := resourceVersions[uid] + if ok { + if oldResourceVersion != currentResourceVersion { + diffs = append(diffs, fmt.Sprintf("object was modified unnecessarily: %s", prettyPrintObjectID(actual))) + final = true + } + } else { + resourceVersions[uid] = currentResourceVersion + } + } expected := findObject(expectedObjects, actual) if expected == nil { if actual.GetKind() == "Secret" { @@ -164,10 +214,9 @@ func DriverDeployment(client client.Client, k8sver version.Version, namespace st } } if diffs != nil { - return fmt.Errorf("deployed driver different from expected deployment:\n%s", strings.Join(diffs, "\n")) + finalErr = fmt.Errorf("deployed driver different from expected deployment:\n%s", strings.Join(diffs, "\n")) } - - return nil + return } func createObject(gvk schema.GroupVersionKind, name, namespace string) unstructured.Unstructured {