diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
index 8648934e..49f2b1ad 100644
--- a/.github/workflows/e2e.yml
+++ b/.github/workflows/e2e.yml
@@ -43,6 +43,7 @@ jobs:
         kubectl label node kind-worker node-role.kubernetes.io/worker=''
         hack/wait-nodes-ready.sh
         kubectl describe nodes
+        uname -a
 
     - name: run E2E tests
       run: |
@@ -79,6 +80,7 @@ jobs:
         kubectl label node kind-worker node-role.kubernetes.io/worker=''
         hack/wait-nodes-ready.sh
         kubectl describe nodes
+        uname -a
 
     - name: run E2E tests
       run: |
@@ -117,6 +119,7 @@ jobs:
         kubectl label node kind-worker node-role.kubernetes.io/worker=''
         hack/wait-nodes-ready.sh
        kubectl describe nodes
+        uname -a
 
     - name: run E2E tests
       run: |
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index 150be42d..307c5845 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -24,17 +24,22 @@ import (
 	"runtime"
 	"strconv"
 	"strings"
+	"sync"
 	"testing"
 	"time"
 
 	"github.com/go-logr/logr"
 	"github.com/onsi/ginkgo/v2"
 	"github.com/onsi/gomega"
+	"sigs.k8s.io/controller-runtime/pkg/client"
 	ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/yaml"
 
+	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/util/wait"
+	k8swait "k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/klog/v2"
 	"k8s.io/klog/v2/klogr"
 
 	"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
@@ -44,9 +49,17 @@ import (
 	"github.com/k8stopologyawareschedwg/podfingerprint"
 
 	"github.com/k8stopologyawareschedwg/deployer/pkg/clientutil"
+	"github.com/k8stopologyawareschedwg/deployer/pkg/clientutil/nodes"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/deployer"
+	"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
+	"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/wait"
+	"github.com/k8stopologyawareschedwg/deployer/pkg/manifests"
+	"github.com/k8stopologyawareschedwg/deployer/pkg/manifests/rte"
+	"github.com/k8stopologyawareschedwg/deployer/pkg/manifests/sched"
+	"github.com/k8stopologyawareschedwg/deployer/pkg/options"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/stringify"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/validator"
+	e2epods "github.com/k8stopologyawareschedwg/deployer/test/e2e/utils/pods"
 )
 
 var (
@@ -132,7 +145,7 @@ func getNodeResourceTopology(tc topologyclientset.Interface, name string, filter
 	var err error
 	var nrt *v1alpha2.NodeResourceTopology
 	fmt.Fprintf(ginkgo.GinkgoWriter, "looking for noderesourcetopology %q\n", name)
-	err = wait.PollImmediate(5*time.Second, 1*time.Minute, func() (bool, error) {
+	err = k8swait.PollImmediate(5*time.Second, 1*time.Minute, func() (bool, error) {
 		nrt, err = tc.TopologyV1alpha2().NodeResourceTopologies().Get(context.TODO(), name, metav1.GetOptions{})
 		if err != nil {
 			return false, err
@@ -190,7 +203,11 @@ func deployWithManifests() error {
 	// TODO: use error wrapping
 	err := runCmdline(cmdline, "failed to deploy components before test started")
 	if err != nil {
-		dumpSchedulerPods()
+		cli, cerr := clientutil.New()
+		if cerr == nil {
+			// don't hide the previous error
+			dumpSchedulerPods(context.Background(), cli)
+		} // else?
 	}
 	return err
 }
@@ -201,7 +218,9 @@ func deploy(updaterType string, pfpEnable bool) error {
 		"deploy",
 		"--rte-config-file=" + filepath.Join(deployerBaseDir, "hack", "rte.yaml"),
 		"--updater-pfp-enable=" + strconv.FormatBool(pfpEnable),
+		"--updater-verbose=5",
 		"--sched-ctrlplane-affinity=false",
+		"--sched-verbose=5",
 		"--wait",
 	}
 	if updaterType != "" {
@@ -211,7 +230,11 @@ func deploy(updaterType string, pfpEnable bool) error {
 	// TODO: use error wrapping
 	err := runCmdline(cmdline, "failed to deploy components before test started")
 	if err != nil {
-		dumpSchedulerPods()
+		cli, cerr := clientutil.New()
+		if cerr == nil {
+			// don't hide the previous error
+			dumpSchedulerPods(context.Background(), cli)
+		} // else?
 	}
 	return err
 }
@@ -244,12 +267,204 @@ func runCmdline(cmdline []string, errMsg string) error {
 }
 
 func NullEnv() *deployer.Environment {
+	ginkgo.GinkgoHelper()
 	cli, err := clientutil.New()
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-	env := deployer.Environment{
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+	return WithEnv(cli)
+}
+
+func WithEnv(cli client.Client) *deployer.Environment {
+	return &deployer.Environment{
 		Ctx: context.TODO(),
 		Cli: cli,
 		Log: logr.Discard(),
 	}
-	return &env
+}
+
+func dumpSchedulerPods(ctx context.Context, cli client.Client) {
+	ginkgo.GinkgoHelper()
+
+	ns, err := manifests.Namespace(manifests.ComponentSchedulerPlugin)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	// TODO: autodetect the platform
+	mfs, err := sched.GetManifests(platform.Kubernetes, ns.Name)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+	mfs, err = mfs.Render(logr.Discard(), options.Scheduler{
+		Replicas: int32(1),
+	})
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	k8sCli, err := clientutil.NewK8s()
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	pods, err := e2epods.GetByDeployment(cli, ctx, *mfs.DPScheduler)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	klog.Warning(">>> scheduler pod status begin:\n")
+	for idx := range pods {
+		pod := pods[idx].DeepCopy()
+		pod.ManagedFields = nil
+
+		data, err := yaml.Marshal(pod)
+		gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+		klog.Warningf("%s\n---\n", string(data))
+
+		e2epods.LogEventsForPod(k8sCli, ctx, pod.Namespace, pod.Name)
+		klog.Warningf("---\n")
+	}
+
+	var cm corev1.ConfigMap
+	key := client.ObjectKey{
+		Namespace: "tas-scheduler",
+		Name:      "scheduler-config",
+	}
+	err = cli.Get(ctx, key, &cm)
+	if err == nil {
+		// skip errors until we can autodetect the CM key
+		klog.Infof("scheduler config:\n%s", cm.Data["scheduler-config.yaml"])
+	}
+
+	klog.Warning(">>> scheduler pod status end\n")
+}
+
+func dumpWorkloadPods(ctx context.Context, pod *corev1.Pod) {
+	ginkgo.GinkgoHelper()
+
+	pod = pod.DeepCopy()
+
+	k8sCli, err := clientutil.NewK8s()
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	klog.Warning(">>> workload pod status begin:\n")
+	pod.ManagedFields = nil
+
+	data, err := yaml.Marshal(pod)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	klog.Warningf("%s\n---\n", string(data))
+
+	e2epods.LogEventsForPod(k8sCli, ctx, pod.Namespace, pod.Name)
+	klog.Warningf("---\n")
+	klog.Warning(">>> workload pod status end\n")
+}
+
+func dumpResourceTopologyExporterPods(ctx context.Context, cli client.Client) {
+	ginkgo.GinkgoHelper()
+
+	ns, err := manifests.Namespace(manifests.ComponentResourceTopologyExporter)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	// TODO: autodetect the platform
+	mfs, err := rte.GetManifests(platform.Kubernetes, platform.Version("1.23"), ns.Name, true)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+	mfs, err = mfs.Render(options.UpdaterDaemon{
+		Namespace: ns.Name,
+	})
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	k8sCli, err := clientutil.NewK8s()
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	pods, err := e2epods.GetByDaemonSet(cli, ctx, *mfs.DaemonSet)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	klog.Warning(">>> RTE pod status begin:\n")
+	if len(pods) > 1 {
+		klog.Warningf("UNEXPECTED POD COUNT %d: dumping only the first", len(pods))
+	}
+	if len(pods) > 0 {
+		pod := pods[0].DeepCopy()
+		pod.ManagedFields = nil
+
+		logs, err := e2epods.GetLogsForPod(k8sCli, pod.Namespace, pod.Name, pod.Spec.Containers[0].Name)
+		if err == nil {
+			// skip errors until we can autodetect the CM key
+			klog.Infof(">>> RTE logs begin:\n%s\n>>> RTE logs end", logs)
+		}
+	}
+
+	klog.Warning(">>> RTE pod status end\n")
+}
+
+func expectSchedulerRunning(ctx context.Context, cli client.Client) {
+	ginkgo.GinkgoHelper()
+
+	ns, err := manifests.Namespace(manifests.ComponentSchedulerPlugin)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	ginkgo.By("checking that scheduler plugin is configured")
+
+	confMap := corev1.ConfigMap{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace: ns.Name,
+			Name:      "scheduler-config", // TODO: duplicate from YAML
+		},
+	}
+	err = cli.Get(ctx, client.ObjectKeyFromObject(&confMap), &confMap)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+	gomega.Expect(confMap.Data).ToNot(gomega.BeNil(), "empty config map for scheduler config")
+
+	data, ok := confMap.Data[manifests.SchedulerConfigFileName]
+	gomega.Expect(ok).To(gomega.BeTrue(), "empty config data for %q", manifests.SchedulerConfigFileName)
+
+	allParams, err := manifests.DecodeSchedulerProfilesFromData([]byte(data))
+	gomega.Expect(len(allParams)).To(gomega.Equal(1), "unexpected params: %#v", allParams)
+
+	params := allParams[0] // TODO: smarter find
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+	gomega.Expect(params.Cache).ToNot(gomega.BeNil(), "no data for scheduler cache config")
+	gomega.Expect(params.Cache.ResyncPeriodSeconds).ToNot(gomega.BeNil(), "no data for scheduler cache resync period")
+
+	ginkgo.By("checking that scheduler plugin is running")
+
+	ginkgo.By("checking that topo-aware-scheduler pod is running")
+	// TODO: autodetect the platform
+	mfs, err := sched.GetManifests(platform.Kubernetes, ns.Name)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+	mfs, err = mfs.Render(logr.Discard(), options.Scheduler{
+		Replicas: int32(1),
+	})
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	var wg sync.WaitGroup
+	for _, dp := range []*appsv1.Deployment{
+		mfs.DPScheduler,
+		mfs.DPController,
+	} {
+		wg.Add(1)
+		go func(dp *appsv1.Deployment) {
+			defer ginkgo.GinkgoRecover()
+			defer wg.Done()
+			_, err = wait.With(cli, logr.Discard()).Interval(10*time.Second).Timeout(3*time.Minute).ForDeploymentComplete(ctx, dp)
+			gomega.Expect(err).ToNot(gomega.HaveOccurred())
+		}(dp)
+	}
+	wg.Wait()
+}
+
+func expectNodeResourceTopologyData() {
+	ginkgo.GinkgoHelper()
+
+	tc, err := clientutil.NewTopologyClient()
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	workers, err := nodes.GetWorkers(NullEnv())
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+	for _, node := range workers {
+		ginkgo.By(fmt.Sprintf("checking node resource topology for %q", node.Name))
+
+		// the name of the nrt object is the same as the worker node's name
+		_ = getNodeResourceTopology(tc, node.Name, func(nrt *v1alpha2.NodeResourceTopology) error {
+			if err := checkHasCPU(nrt); err != nil {
+				return err
+			}
+			if err := checkHasPFP(nrt); err != nil {
+				return err
+			}
+			return nil
+		})
+	}
 }
diff --git a/test/e2e/manifests.go b/test/e2e/manifests.go
index db80fdd1..b022cf65 100644
--- a/test/e2e/manifests.go
+++ b/test/e2e/manifests.go
@@ -29,10 +29,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 
-	"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
-
 	"github.com/k8stopologyawareschedwg/deployer/pkg/clientutil"
-	"github.com/k8stopologyawareschedwg/deployer/pkg/clientutil/nodes"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/manifests"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/manifests/rte"
@@ -81,25 +78,7 @@ var _ = ginkgo.Describe("[ManifestFlow] Deployer rendering", func() {
 			e2epods.WaitPodsToBeRunningByRegex(fmt.Sprintf("%s-*", mfs.DPScheduler.Name))
 
 			ginkgo.By("checking that noderesourcetopolgy has some information in it")
-			tc, err := clientutil.NewTopologyClient()
-			gomega.Expect(err).ToNot(gomega.HaveOccurred())
-
-			workers, err := nodes.GetWorkers(NullEnv())
-			gomega.Expect(err).ToNot(gomega.HaveOccurred())
-			for _, node := range workers {
-				ginkgo.By(fmt.Sprintf("checking node resource topology for %q", node.Name))
-
-				// the name of the nrt object is the same as the worker node's name
-				_ = getNodeResourceTopology(tc, node.Name, func(nrt *v1alpha2.NodeResourceTopology) error {
-					if err := checkHasCPU(nrt); err != nil {
-						return err
-					}
-					if err := checkHasPFP(nrt); err != nil {
-						return err
-					}
-					return nil
-				})
-			}
+			expectNodeResourceTopologyData()
 		})
 
 		ginkgo.It("should verify a test pod scheduled with the topology aware scheduler goes running", func() {
@@ -122,6 +101,8 @@ var _ = ginkgo.Describe("[ManifestFlow] Deployer rendering", func() {
 				ginkgo.Skip("skipping the pod check - not enough resources")
 			}
 
+			expectNodeResourceTopologyData()
+
 			testNs := &corev1.Namespace{
 				ObjectMeta: metav1.ObjectMeta{
 					GenerateName: "tas-test-",
@@ -141,7 +122,17 @@ var _ = ginkgo.Describe("[ManifestFlow] Deployer rendering", func() {
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 
 			ginkgo.By("checking the pod goes running")
-			e2epods.WaitForPodToBeRunning(cli, testPod.Namespace, testPod.Name, 2*time.Minute)
+			updatedPod, err := e2epods.WaitForPodToBeRunning(context.TODO(), cli, testPod.Namespace, testPod.Name, 3*time.Minute)
+			if err != nil {
+				ctx := context.Background()
+				cli, cerr := clientutil.New()
+				if cerr == nil {
+					dumpResourceTopologyExporterPods(ctx, cli)
+					dumpSchedulerPods(ctx, cli)
+					dumpWorkloadPods(ctx, updatedPod)
+				}
+			}
+			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 		})
 	})
 })
diff --git a/test/e2e/negative.go b/test/e2e/negative.go
index 5ef34aa9..ab167402 100644
--- a/test/e2e/negative.go
+++ b/test/e2e/negative.go
@@ -161,18 +161,7 @@ var _ = ginkgo.Describe("[NegativeFlow] Deployer execution with PFP disabled", f
 			e2epods.WaitPodsToBeRunningByRegex(fmt.Sprintf("%s-*", mfs.DPScheduler.Name))
 
 			ginkgo.By("checking that noderesourcetopolgy has some information in it")
-			tc, err := clientutil.NewTopologyClient()
-			gomega.Expect(err).ToNot(gomega.HaveOccurred())
-
-			workers, err := nodes.GetWorkers(NullEnv())
-			gomega.Expect(err).ToNot(gomega.HaveOccurred())
-			for _, node := range workers {
-				ginkgo.By(fmt.Sprintf("checking node resource topology for %q", node.Name))
-
-				// the name of the nrt object is the same as the worker node's name
-				err = ensureNodeResourceTopology(tc, node.Name, checkLacksPFP)
-				gomega.Expect(err).ToNot(gomega.HaveOccurred())
-			}
+			expectNodeResourceTopologyData()
 		})
 	})
 })
diff --git a/test/e2e/positive.go b/test/e2e/positive.go
index 7e68ce61..0bc34acb 100644
--- a/test/e2e/positive.go
+++ b/test/e2e/positive.go
@@ -24,7 +24,6 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
-	"sync"
 	"time"
 
 	"github.com/go-logr/logr"
@@ -33,22 +32,13 @@ import (
 	"github.com/onsi/ginkgo/v2"
 	"github.com/onsi/gomega"
 
-	"sigs.k8s.io/controller-runtime/pkg/client"
-	"sigs.k8s.io/yaml"
-
-	appsv1 "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/klog/v2"
-
-	"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
 
 	"github.com/k8stopologyawareschedwg/deployer/pkg/clientutil"
-	"github.com/k8stopologyawareschedwg/deployer/pkg/clientutil/nodes"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform/detect"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/updaters"
-	"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/wait"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/manifests"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/manifests/nfd"
 	"github.com/k8stopologyawareschedwg/deployer/pkg/manifests/rte"
@@ -272,25 +262,7 @@ var _ = ginkgo.Describe("[PositiveFlow] Deployer execution", func() {
 			e2epods.WaitPodsToBeRunningByRegex(fmt.Sprintf("%s-*", mfs.DPScheduler.Name))
 
 			ginkgo.By("checking that noderesourcetopolgy has some information in it")
-			tc, err := clientutil.NewTopologyClient()
-			gomega.Expect(err).ToNot(gomega.HaveOccurred())
-
-			workers, err := nodes.GetWorkers(NullEnv())
-			gomega.Expect(err).ToNot(gomega.HaveOccurred())
-			for _, node := range workers {
-				ginkgo.By(fmt.Sprintf("checking node resource topology for %q", node.Name))
-
-				// the name of the nrt object is the same as the worker node's name
-				_ = getNodeResourceTopology(tc, node.Name, func(nrt *v1alpha2.NodeResourceTopology) error {
-					if err := checkHasCPU(nrt); err != nil {
-						return err
-					}
-					if err := checkHasPFP(nrt); err != nil {
-						return err
-					}
-					return nil
-				})
-			}
+			expectNodeResourceTopologyData()
 		})
 
 		ginkgo.It("should verify a test pod scheduled with the topology aware scheduler goes running", func() {
@@ -305,6 +277,8 @@ var _ = ginkgo.Describe("[PositiveFlow] Deployer execution", func() {
 				ginkgo.Fail("no worker nodes found in the cluster")
 			}
 
+			expectNodeResourceTopologyData()
+
 			// min 1 reserved + min 1 allocatable = 2
 			nodes, err := e2enodes.FilterNodesWithEnoughCores(workerNodes, "2")
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
@@ -332,7 +306,17 @@ var _ = ginkgo.Describe("[PositiveFlow] Deployer execution", func() {
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 
 			ginkgo.By("checking the pod goes running")
-			e2epods.WaitForPodToBeRunning(cli, testPod.Namespace, testPod.Name, 2*time.Minute)
+			updatedPod, err := e2epods.WaitForPodToBeRunning(context.TODO(), cli, testPod.Namespace, testPod.Name, 3*time.Minute)
+			if err != nil {
+				ctx := context.Background()
+				cli, cerr := clientutil.New()
+				if cerr == nil {
+					dumpResourceTopologyExporterPods(ctx, cli)
+					dumpSchedulerPods(ctx, cli)
+					dumpWorkloadPods(ctx, updatedPod)
+				}
+			}
+			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 		})
 
 	})
@@ -367,25 +351,7 @@ var _ = ginkgo.Describe("[PositiveFlow] Deployer execution", func() {
 			e2epods.WaitPodsToBeRunningByRegex(fmt.Sprintf("%s-*", mfs.DPScheduler.Name))
 
 			ginkgo.By("checking that noderesourcetopolgy has some information in it")
-			tc, err := clientutil.NewTopologyClient()
-			gomega.Expect(err).ToNot(gomega.HaveOccurred())
-
-			workers, err := nodes.GetWorkers(NullEnv())
-			gomega.Expect(err).ToNot(gomega.HaveOccurred())
-			for _, node := range workers {
-				ginkgo.By(fmt.Sprintf("checking node resource topology for %q", node.Name))
-
-				// the name of the nrt object is the same as the worker node's name
-				_ = getNodeResourceTopology(tc, node.Name, func(nrt *v1alpha2.NodeResourceTopology) error {
-					if err := checkHasCPU(nrt); err != nil {
-						return err
-					}
-					if err := checkHasPFP(nrt); err != nil {
-						return err
-					}
-					return nil
-				})
-			}
+			expectNodeResourceTopologyData()
 		})
 
 		ginkgo.It("should verify a test pod scheduled with the topology aware scheduler goes running", func() {
@@ -408,6 +374,8 @@ var _ = ginkgo.Describe("[PositiveFlow] Deployer execution", func() {
 				ginkgo.Skip("skipping the pod check - not enough resources")
 			}
 
+			expectNodeResourceTopologyData()
+
 			testNs := &corev1.Namespace{
 				ObjectMeta: metav1.ObjectMeta{
 					GenerateName: "tas-test-",
@@ -427,7 +395,7 @@ var _ = ginkgo.Describe("[PositiveFlow] Deployer execution", func() {
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
 
 			ginkgo.By("checking the pod goes running")
-			e2epods.WaitForPodToBeRunning(cli, testPod.Namespace, testPod.Name, 2*time.Minute)
+			e2epods.ExpectPodToBeRunning(cli, testPod.Namespace, testPod.Name, 2*time.Minute)
 		})
 	})
 })
@@ -449,7 +417,11 @@ var _ = ginkgo.Describe("[PositiveFlow] Deployer partial execution", func() {
 				"failed to deploy partial components before test started",
 			)
 			if err != nil {
-				dumpSchedulerPods()
+				cli, cerr := clientutil.New()
+				if cerr == nil {
+					// don't hide the previous error
+					dumpSchedulerPods(context.Background(), cli)
+				} // else?
 			}
 
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
@@ -467,7 +439,10 @@ var _ = ginkgo.Describe("[PositiveFlow] Deployer partial execution", func() {
 				gomega.Expect(err).ToNot(gomega.HaveOccurred())
 			}()
 
-			expectSchedulerRunning()
+			cli, err := clientutil.New()
+			gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+			expectSchedulerRunning(context.Background(), cli)
 		})
 
 		ginkgo.It("should perform the deployment of scheduler plugin (with extreme verbosity) + API and verify all pods are running", func() {
@@ -484,7 +459,11 @@ var _ = ginkgo.Describe("[PositiveFlow] Deployer partial execution", func() {
 				"failed to deploy partial components before test started",
 			)
 			if err != nil {
-				dumpSchedulerPods()
+				cli, cerr := clientutil.New()
+				if cerr == nil {
+					// don't hide the previous error
+					dumpSchedulerPods(context.Background(), cli)
+				} // else?
 			}
 
 			gomega.Expect(err).ToNot(gomega.HaveOccurred())
@@ -502,108 +481,10 @@ var _ = ginkgo.Describe("[PositiveFlow] Deployer partial execution", func() {
 				gomega.Expect(err).ToNot(gomega.HaveOccurred())
 			}()
 
-			expectSchedulerRunning()
+			cli, err := clientutil.New()
+			gomega.Expect(err).ToNot(gomega.HaveOccurred())
+
+			expectSchedulerRunning(context.Background(), cli)
 		})
 	})
 })
-
-func dumpSchedulerPods() {
-	ns, err := manifests.Namespace(manifests.ComponentSchedulerPlugin)
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-
-	// TODO: autodetect the platform
-	mfs, err := sched.GetManifests(platform.Kubernetes, ns.Name)
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-	mfs, err = mfs.Render(logr.Discard(), options.Scheduler{
-		Replicas: int32(1),
-	})
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-
-	cli, err := clientutil.New()
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-
-	k8sCli, err := clientutil.NewK8s()
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-
-	ctx := context.Background()
-
-	pods, err := e2epods.GetByDeployment(cli, ctx, *mfs.DPScheduler)
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-
-	klog.Warning(">>> scheduler pod status begin:\n")
-	for idx := range pods {
-		pod := &pods[idx]
-
-		// TODO
-		pod.ManagedFields = nil
-		// TODO
-
-		data, err := yaml.Marshal(pod)
-		gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-
-		klog.Warningf("%s\n---\n", string(data))
-
-		e2epods.LogEventsForPod(k8sCli, ctx, pod.Namespace, pod.Name)
-		klog.Warningf("---\n")
-	}
-	klog.Warning(">>> scheduler pod status end\n")
-}
-
-func expectSchedulerRunning() {
-	ns, err := manifests.Namespace(manifests.ComponentSchedulerPlugin)
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-
-	cli, err := clientutil.New()
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-
-	ctx := context.Background()
-
-	ginkgo.By("checking that scheduler plugin is configured")
-
-	confMap := corev1.ConfigMap{
-		ObjectMeta: metav1.ObjectMeta{
-			Namespace: ns.Name,
-			Name:      "scheduler-config", // TODO: duplicate from YAML
-		},
-	}
-	err = cli.Get(ctx, client.ObjectKeyFromObject(&confMap), &confMap)
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-	gomega.ExpectWithOffset(1, confMap.Data).ToNot(gomega.BeNil(), "empty config map for scheduler config")
-
-	data, ok := confMap.Data[manifests.SchedulerConfigFileName]
-	gomega.ExpectWithOffset(1, ok).To(gomega.BeTrue(), "empty config data for %q", manifests.SchedulerConfigFileName)
-
-	allParams, err := manifests.DecodeSchedulerProfilesFromData([]byte(data))
-	gomega.ExpectWithOffset(1, len(allParams)).To(gomega.Equal(1), "unexpected params: %#v", allParams)
-
-	params := allParams[0] // TODO: smarter find
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-	gomega.ExpectWithOffset(1, params.Cache).ToNot(gomega.BeNil(), "no data for scheduler cache config")
-	gomega.ExpectWithOffset(1, params.Cache.ResyncPeriodSeconds).ToNot(gomega.BeNil(), "no data for scheduler cache resync period")
-
-	ginkgo.By("checking that scheduler plugin is running")
-
-	ginkgo.By("checking that topo-aware-scheduler pod is running")
-	// TODO: autodetect the platform
-	mfs, err := sched.GetManifests(platform.Kubernetes, ns.Name)
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-	mfs, err = mfs.Render(logr.Discard(), options.Scheduler{
-		Replicas: int32(1),
-	})
-	gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-
-	var wg sync.WaitGroup
-	for _, dp := range []*appsv1.Deployment{
-		mfs.DPScheduler,
-		mfs.DPController,
-	} {
-		wg.Add(1)
-		go func(dp *appsv1.Deployment) {
-			defer ginkgo.GinkgoRecover()
-			defer wg.Done()
-			_, err = wait.With(cli, logr.Discard()).Interval(10*time.Second).Timeout(3*time.Minute).ForDeploymentComplete(ctx, dp)
-			gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
-		}(dp)
-	}
-	wg.Wait()
-}
diff --git a/test/e2e/utils/pods/logs.go b/test/e2e/utils/pods/logs.go
new file mode 100644
index 00000000..48d6f233
--- /dev/null
+++ b/test/e2e/utils/pods/logs.go
@@ -0,0 +1,31 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Copyright 2024 Red Hat, Inc.
+ */
+
+package pods
+
+import (
+	"context"
+	"strconv"
+
+	"k8s.io/client-go/kubernetes"
+)
+
+func GetLogsForPod(k8sCli kubernetes.Interface, podNamespace, podName, containerName string) (string, error) {
+	previous := false
+	request := k8sCli.CoreV1().RESTClient().Get().Resource("pods").Namespace(podNamespace).Name(podName).SubResource("log").Param("container", containerName).Param("previous", strconv.FormatBool(previous))
+	logs, err := request.Do(context.TODO()).Raw()
+	return string(logs), err
+}
diff --git a/test/e2e/utils/pods/pods.go b/test/e2e/utils/pods/pods.go
index 4c4594c6..c6fcab0e 100644
--- a/test/e2e/utils/pods/pods.go
+++ b/test/e2e/utils/pods/pods.go
@@ -30,6 +30,7 @@ import (
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	k8swait "k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/kubernetes"
 
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -69,26 +70,34 @@ func GuaranteedSleeperPod(namespace, schedulerName string) *corev1.Pod {
 	}
 }
 
-func WaitForPodToBeRunning(cli *kubernetes.Clientset, podNamespace, podName string, timeout time.Duration) *corev1.Pod {
+func WaitForPodToBeRunning(ctx context.Context, cli kubernetes.Interface, podNamespace, podName string, timeout time.Duration) (*corev1.Pod, error) {
 	var err error
 	var pod *corev1.Pod
 	startTime := time.Now()
-	gomega.EventuallyWithOffset(1, func() error {
-		pod, err = cli.CoreV1().Pods(podNamespace).Get(context.TODO(), podName, metav1.GetOptions{})
-		if err != nil {
-			return err
+	err = k8swait.PollUntilContextTimeout(ctx, 10*time.Second, timeout, true, func(fctx context.Context) (bool, error) {
+		var err2 error
+		pod, err2 = cli.CoreV1().Pods(podNamespace).Get(fctx, podName, metav1.GetOptions{})
+		if err2 != nil {
+			return false, err2
 		}
 		switch pod.Status.Phase {
 		case corev1.PodFailed, corev1.PodSucceeded:
-			return fmt.Errorf("pod %q status %q which is unexpected", podName, pod.Status.Phase)
+			return false, fmt.Errorf("pod %q status %q which is unexpected", podName, pod.Status.Phase)
 		case corev1.PodRunning:
 			fmt.Fprintf(ginkgo.GinkgoWriter, "Pod %q is running! (took %v)\n", podName, time.Since(startTime))
-			return nil
+			return true, nil
 		}
 		msg := fmt.Sprintf("pod %q status %q, waiting for it to be Running (with Ready = true)", podName, pod.Status.Phase)
 		fmt.Fprintln(ginkgo.GinkgoWriter, msg)
-		return errors.New(msg)
-	}, timeout, 10*time.Second).ShouldNot(gomega.HaveOccurred())
+		return false, nil
+	})
+	return pod, err
+}
+
+func ExpectPodToBeRunning(cli kubernetes.Interface, podNamespace, podName string, timeout time.Duration) *corev1.Pod {
+	ginkgo.GinkgoHelper()
+	pod, err := WaitForPodToBeRunning(context.Background(), cli, podNamespace, podName, timeout)
+	gomega.Expect(err).ToNot(gomega.HaveOccurred())
 	return pod
 }
 
@@ -153,3 +162,18 @@ func GetByDeployment(cli client.Client, ctx context.Context, deployment appsv1.D
 
 	return podList.Items, nil
 }
+
+func GetByDaemonSet(cli client.Client, ctx context.Context, daemonset appsv1.DaemonSet) ([]corev1.Pod, error) {
+	podList := &corev1.PodList{}
+	sel, err := metav1.LabelSelectorAsSelector(daemonset.Spec.Selector)
+	if err != nil {
+		return nil, err
+	}
+
+	err = cli.List(ctx, podList, &client.ListOptions{Namespace: daemonset.Namespace, LabelSelector: sel})
+	if err != nil {
+		return nil, err
+	}
+
+	return podList.Items, nil
+}