Skip to content

Commit

Permalink
e2e: add facilities to improve troubleshooting
Browse files Browse the repository at this point in the history
Add helpers to make it easier to understand
the test failures.

Signed-off-by: Francesco Romani <[email protected]>
  • Loading branch information
ffromani committed Mar 5, 2024
1 parent ccb2fb9 commit beaebdf
Show file tree
Hide file tree
Showing 5 changed files with 348 additions and 168 deletions.
268 changes: 263 additions & 5 deletions test/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,22 @@ import (
"runtime"
"strconv"
"strings"
"sync"
"testing"
"time"

"github.com/go-logr/logr"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
"sigs.k8s.io/controller-runtime/pkg/client"
ctrllog "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/yaml"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
k8swait "k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/klog/v2/klogr"

"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"
Expand All @@ -44,9 +49,17 @@ import (
"github.com/k8stopologyawareschedwg/podfingerprint"

"github.com/k8stopologyawareschedwg/deployer/pkg/clientutil"
"github.com/k8stopologyawareschedwg/deployer/pkg/clientutil/nodes"
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer"
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/wait"
"github.com/k8stopologyawareschedwg/deployer/pkg/manifests"
"github.com/k8stopologyawareschedwg/deployer/pkg/manifests/rte"
"github.com/k8stopologyawareschedwg/deployer/pkg/manifests/sched"
"github.com/k8stopologyawareschedwg/deployer/pkg/options"
"github.com/k8stopologyawareschedwg/deployer/pkg/stringify"
"github.com/k8stopologyawareschedwg/deployer/pkg/validator"
e2epods "github.com/k8stopologyawareschedwg/deployer/test/e2e/utils/pods"
)

var (
Expand Down Expand Up @@ -96,8 +109,12 @@ func checkHasCPU(nrt *v1alpha2.NodeResourceTopology) error {
ok := false
for _, zone := range nrt.Zones {
for _, resource := range zone.Resources {
if resource.Name == string(corev1.ResourceCPU) && resource.Capacity.Size() >= 1 {
ok = true
if resource.Name == string(corev1.ResourceCPU) {
if resource.Capacity.Value() >= 2 {
ok = true
} else {
klog.Warningf("zone %q not enough CPU", zone.Name)
}
}
}
}
Expand Down Expand Up @@ -132,7 +149,7 @@ func getNodeResourceTopology(tc topologyclientset.Interface, name string, filter
var err error
var nrt *v1alpha2.NodeResourceTopology
fmt.Fprintf(ginkgo.GinkgoWriter, "looking for noderesourcetopology %q\n", name)
err = wait.PollImmediate(5*time.Second, 1*time.Minute, func() (bool, error) {
err = k8swait.PollImmediate(5*time.Second, 1*time.Minute, func() (bool, error) {
nrt, err = tc.TopologyV1alpha2().NodeResourceTopologies().Get(context.TODO(), name, metav1.GetOptions{})
if err != nil {
return false, err
Expand All @@ -149,6 +166,31 @@ func getNodeResourceTopology(tc topologyclientset.Interface, name string, filter
return nrt
}

func expectNRTData() {
ginkgo.GinkgoHelper()

ginkgo.By("checking that noderesourcetopolgy has some information in it")
tc, err := clientutil.NewTopologyClient()
gomega.Expect(err).ToNot(gomega.HaveOccurred())

workers, err := nodes.GetWorkers(NullEnv())
gomega.Expect(err).ToNot(gomega.HaveOccurred())
for _, node := range workers {
ginkgo.By(fmt.Sprintf("checking node resource topology for %q", node.Name))

// the name of the nrt object is the same as the worker node's name
_ = getNodeResourceTopology(tc, node.Name, func(nrt *v1alpha2.NodeResourceTopology) error {
if err := checkHasCPU(nrt); err != nil {
return err
}
if err := checkHasPFP(nrt); err != nil {
return err
}
return nil
})
}
}

func ensureNodeResourceTopology(tc topologyclientset.Interface, name string, filterFunc func(nrt *v1alpha2.NodeResourceTopology) error) error {
fmt.Fprintf(ginkgo.GinkgoWriter, "ensuring predicate for noderesourcetopology %q\n", name)
var err error
Expand Down Expand Up @@ -201,7 +243,9 @@ func deploy(updaterType string, pfpEnable bool) error {
"deploy",
"--rte-config-file=" + filepath.Join(deployerBaseDir, "hack", "rte.yaml"),
"--updater-pfp-enable=" + strconv.FormatBool(pfpEnable),
"--updater-verbose=5",
"--sched-ctrlplane-affinity=false",
"--sched-verbose=5",
"--wait",
}
if updaterType != "" {
Expand Down Expand Up @@ -244,12 +288,226 @@ func runCmdline(cmdline []string, errMsg string) error {
}

func NullEnv() *deployer.Environment {
ginkgo.GinkgoHelper()

cli, err := clientutil.New()
gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
gomega.Expect(err).ToNot(gomega.HaveOccurred())
env := deployer.Environment{
Ctx: context.TODO(),
Cli: cli,
Log: logr.Discard(),
}
return &env
}

func dumpNRTInfo(cli client.Client) {
ginkgo.GinkgoHelper()

tc, err := clientutil.NewTopologyClient()
gomega.Expect(err).ToNot(gomega.HaveOccurred())

workers, err := nodes.GetWorkers(&deployer.Environment{
Ctx: context.TODO(),
Cli: cli,
Log: logr.Discard(),
})

gomega.Expect(err).ToNot(gomega.HaveOccurred())
for _, node := range workers {
// the name of the nrt object is the same as the worker node's name
nrt := getNodeResourceTopology(tc, node.Name, func(nrt *v1alpha2.NodeResourceTopology) error {
return nil
})

data, err := yaml.Marshal(nrt)
gomega.Expect(err).ToNot(gomega.HaveOccurred())

klog.Infof("NRT data for %s:\n---\n%s\n---\n", node.Name, string(data))
}
}

func dumpSchedulerPods() {
ginkgo.GinkgoHelper()

ns, err := manifests.Namespace(manifests.ComponentSchedulerPlugin)
gomega.Expect(err).ToNot(gomega.HaveOccurred())

// TODO: autodetect the platform
mfs, err := sched.GetManifests(platform.Kubernetes, ns.Name)
gomega.Expect(err).ToNot(gomega.HaveOccurred())
mfs, err = mfs.Render(logr.Discard(), options.Scheduler{
Replicas: int32(1),
})
gomega.Expect(err).ToNot(gomega.HaveOccurred())

cli, err := clientutil.New()
gomega.Expect(err).ToNot(gomega.HaveOccurred())

dumpNRTInfo(cli)

k8sCli, err := clientutil.NewK8s()
gomega.Expect(err).ToNot(gomega.HaveOccurred())

ctx := context.Background()

pods, err := e2epods.GetByDeployment(cli, ctx, *mfs.DPScheduler)
gomega.Expect(err).ToNot(gomega.HaveOccurred())

klog.Warning(">>> scheduler pod status begin:\n")
if len(pods) > 1 {
klog.Warningf("UNEXPECTED POD COUNT %d: dumping only the first", len(pods))
}
if len(pods) > 0 {
pod := pods[0].DeepCopy()
pod.ManagedFields = nil

data, err := yaml.Marshal(pod)
gomega.Expect(err).ToNot(gomega.HaveOccurred())

klog.Warningf("\n---\n%s\n---\n", string(data))

logs, err := e2epods.GetLogsForPod(k8sCli, pod.Namespace, pod.Name, pod.Spec.Containers[0].Name)
if err == nil {
// skip errors until we can autodetect the CM key
klog.Infof(">>> scheduler logs begin:\n%s\n>>> scheduler logs end", logs)
}
}

var cm corev1.ConfigMap
key := client.ObjectKey{
Namespace: "tas-scheduler",
Name: "scheduler-config",
}
err = cli.Get(ctx, key, &cm)
if err == nil {
// skip errors until we can autodetect the CM key
klog.Infof("scheduler config:\n%s", cm.Data["scheduler-config.yaml"])
}

klog.Warning(">>> scheduler pod status end\n")
}

func dumpWorkloadPods(pod *corev1.Pod) {
ginkgo.GinkgoHelper()

pod = pod.DeepCopy()

k8sCli, err := clientutil.NewK8s()
gomega.Expect(err).ToNot(gomega.HaveOccurred())

ctx := context.Background()

klog.Warning(">>> workload pod status begin:\n")
pod.ManagedFields = nil

data, err := yaml.Marshal(pod)
gomega.Expect(err).ToNot(gomega.HaveOccurred())

klog.Warningf("%s\n---\n", string(data))

e2epods.LogEventsForPod(k8sCli, ctx, pod.Namespace, pod.Name)
klog.Warningf("---\n")
klog.Warning(">>> workload pod status end\n")
}

func dumpResourceTopologyExporterPods() {
ginkgo.GinkgoHelper()

ns, err := manifests.Namespace(manifests.ComponentResourceTopologyExporter)
gomega.Expect(err).ToNot(gomega.HaveOccurred())

// TODO: autodetect the platform
mfs, err := rte.GetManifests(platform.Kubernetes, platform.Version("1.23"), ns.Name, true)
gomega.Expect(err).ToNot(gomega.HaveOccurred())
mfs, err = mfs.Render(options.UpdaterDaemon{
Namespace: ns.Name,
})
gomega.Expect(err).ToNot(gomega.HaveOccurred())

cli, err := clientutil.New()
gomega.Expect(err).ToNot(gomega.HaveOccurred())

k8sCli, err := clientutil.NewK8s()
gomega.Expect(err).ToNot(gomega.HaveOccurred())

ctx := context.Background()

pods, err := e2epods.GetByDaemonSet(cli, ctx, *mfs.DaemonSet)
gomega.Expect(err).ToNot(gomega.HaveOccurred())

klog.Warning(">>> RTE pod status begin:\n")
if len(pods) > 1 {
klog.Warningf("UNEXPECTED POD COUNT %d: dumping only the first", len(pods))
}
if len(pods) > 0 {
pod := pods[0].DeepCopy()
pod.ManagedFields = nil

logs, err := e2epods.GetLogsForPod(k8sCli, pod.Namespace, pod.Name, pod.Spec.Containers[0].Name)
if err == nil {
// skip errors until we can autodetect the CM key
klog.Infof(">>> RTE logs begin:\n%s\n>>> RTE logs end", logs)
}
}

klog.Warning(">>> RTE pod status end\n")
}

func expectSchedulerRunning() {
ns, err := manifests.Namespace(manifests.ComponentSchedulerPlugin)
gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())

cli, err := clientutil.New()
gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())

ctx := context.Background()

ginkgo.By("checking that scheduler plugin is configured")

confMap := corev1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Namespace: ns.Name,
Name: "scheduler-config", // TODO: duplicate from YAML
},
}
err = cli.Get(ctx, client.ObjectKeyFromObject(&confMap), &confMap)
gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
gomega.ExpectWithOffset(1, confMap.Data).ToNot(gomega.BeNil(), "empty config map for scheduler config")

data, ok := confMap.Data[manifests.SchedulerConfigFileName]
gomega.ExpectWithOffset(1, ok).To(gomega.BeTrue(), "empty config data for %q", manifests.SchedulerConfigFileName)

allParams, err := manifests.DecodeSchedulerProfilesFromData([]byte(data))
gomega.ExpectWithOffset(1, len(allParams)).To(gomega.Equal(1), "unexpected params: %#v", allParams)

params := allParams[0] // TODO: smarter find
gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
gomega.ExpectWithOffset(1, params.Cache).ToNot(gomega.BeNil(), "no data for scheduler cache config")
gomega.ExpectWithOffset(1, params.Cache.ResyncPeriodSeconds).ToNot(gomega.BeNil(), "no data for scheduler cache resync period")

ginkgo.By("checking that scheduler plugin is running")

ginkgo.By("checking that topo-aware-scheduler pod is running")
// TODO: autodetect the platform
mfs, err := sched.GetManifests(platform.Kubernetes, ns.Name)
gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
mfs, err = mfs.Render(logr.Discard(), options.Scheduler{
Replicas: int32(1),
})
gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())

var wg sync.WaitGroup
for _, dp := range []*appsv1.Deployment{
mfs.DPScheduler,
mfs.DPController,
} {
wg.Add(1)
go func(dp *appsv1.Deployment) {
defer ginkgo.GinkgoRecover()
defer wg.Done()
_, err = wait.With(cli, logr.Discard()).Interval(10*time.Second).Timeout(3*time.Minute).ForDeploymentComplete(ctx, dp)
gomega.ExpectWithOffset(1, err).ToNot(gomega.HaveOccurred())
}(dp)
}
wg.Wait()
}
34 changes: 10 additions & 24 deletions test/e2e/manifests.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,7 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha2"

"github.com/k8stopologyawareschedwg/deployer/pkg/clientutil"
"github.com/k8stopologyawareschedwg/deployer/pkg/clientutil/nodes"
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
"github.com/k8stopologyawareschedwg/deployer/pkg/manifests"
"github.com/k8stopologyawareschedwg/deployer/pkg/manifests/rte"
Expand Down Expand Up @@ -80,26 +77,7 @@ var _ = ginkgo.Describe("[ManifestFlow] Deployer rendering", func() {
gomega.Expect(err).ToNot(gomega.HaveOccurred())
e2epods.WaitPodsToBeRunningByRegex(fmt.Sprintf("%s-*", mfs.DPScheduler.Name))

ginkgo.By("checking that noderesourcetopolgy has some information in it")
tc, err := clientutil.NewTopologyClient()
gomega.Expect(err).ToNot(gomega.HaveOccurred())

workers, err := nodes.GetWorkers(NullEnv())
gomega.Expect(err).ToNot(gomega.HaveOccurred())
for _, node := range workers {
ginkgo.By(fmt.Sprintf("checking node resource topology for %q", node.Name))

// the name of the nrt object is the same as the worker node's name
_ = getNodeResourceTopology(tc, node.Name, func(nrt *v1alpha2.NodeResourceTopology) error {
if err := checkHasCPU(nrt); err != nil {
return err
}
if err := checkHasPFP(nrt); err != nil {
return err
}
return nil
})
}
expectNRTData()
})

ginkgo.It("should verify a test pod scheduled with the topology aware scheduler goes running", func() {
Expand All @@ -122,6 +100,8 @@ var _ = ginkgo.Describe("[ManifestFlow] Deployer rendering", func() {
ginkgo.Skip("skipping the pod check - not enough resources")
}

expectNRTData()

testNs := &corev1.Namespace{
ObjectMeta: metav1.ObjectMeta{
GenerateName: "tas-test-",
Expand All @@ -141,7 +121,13 @@ var _ = ginkgo.Describe("[ManifestFlow] Deployer rendering", func() {
gomega.Expect(err).ToNot(gomega.HaveOccurred())

ginkgo.By("checking the pod goes running")
e2epods.WaitForPodToBeRunning(cli, testPod.Namespace, testPod.Name, 2*time.Minute)
updatedPod, err := e2epods.WaitForPodToBeRunning(context.TODO(), cli, testPod.Namespace, testPod.Name, 3*time.Minute)
if err != nil {
dumpResourceTopologyExporterPods()
dumpSchedulerPods()
dumpWorkloadPods(updatedPod)
}
gomega.Expect(err).ToNot(gomega.HaveOccurred())
})
})
})
Expand Down
Loading

0 comments on commit beaebdf

Please sign in to comment.