From 401262fd9b1f28ba0547d77bcb265e8264b87c81 Mon Sep 17 00:00:00 2001 From: Sid Shukla Date: Wed, 20 Mar 2024 04:35:55 +0100 Subject: [PATCH] Add variable for setting the GPUs on cluster The change also includes a patch to set the GPUs on nutanixmachinetemplate resources for control plane and worker machine deployments. --- templates/cluster-template-clusterclass.yaml | 57 ++++++++++++++++ templates/clusterclass/clusterclass.yaml | 56 ++++++++++++++++ templates/template_test.go | 70 +++++++++++++++----- templates/testdata/cluster-with-gpu.yaml | 58 ++++++++++++++++ 4 files changed, 226 insertions(+), 15 deletions(-) create mode 100644 templates/testdata/cluster-with-gpu.yaml diff --git a/templates/cluster-template-clusterclass.yaml b/templates/cluster-template-clusterclass.yaml index c98564b4e2..0a3715aa99 100644 --- a/templates/cluster-template-clusterclass.yaml +++ b/templates/cluster-template-clusterclass.yaml @@ -184,6 +184,19 @@ spec: matchResources: controlPlane: true name: update-control-plane-machine-template + - definitions: + - jsonPatches: + - op: add + path: /spec/template/spec/gpus + valueFrom: + variable: controlPlaneMachineDetails.gpus + selector: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: NutanixMachineTemplate + matchResources: + controlPlane: true + enabledIf: '{{if .controlPlaneMachineDetails.gpus}}true{{end}}' + name: update-control-plane-machine-template-gpus - definitions: - jsonPatches: - op: add @@ -224,6 +237,21 @@ spec: names: - nutanix-quick-start-worker name: update-worker-machine-template + - definitions: + - jsonPatches: + - op: add + path: /spec/template/spec/gpus + valueFrom: + variable: workerMachineDetails.gpus + selector: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: NutanixMachineTemplate + matchResources: + machineDeploymentClass: + names: + - nutanix-quick-start-worker + enabledIf: '{{if .workerMachineDetails.gpus}}true{{end}}' + name: update-worker-machine-template-gpus - definitions: - jsonPatches: - op: replace @@ -308,6 +336,7 @@ spec: required: true schema: openAPIV3Schema: + description: IP and port of the control plane endpoint. properties: IP: type: string @@ -318,6 +347,7 @@ spec: required: true schema: openAPIV3Schema: + description: Endpoint and credentials of the Prism Central. properties: additionalTrustBundle: type: string @@ -334,11 +364,23 @@ spec: required: true schema: openAPIV3Schema: + description: Details of the control plane machine deployment. properties: bootType: type: string clusterName: type: string + gpus: + items: + properties: + deviceID: + type: integer + name: + type: string + type: + type: string + type: object + type: array imageName: type: string memorySize: @@ -356,11 +398,23 @@ spec: required: true schema: openAPIV3Schema: + description: Details of the worker machine deployment. properties: bootType: type: string clusterName: type: string + gpus: + items: + properties: + deviceID: + type: integer + name: + type: string + type: + type: string + type: object + type: array imageName: type: string memorySize: @@ -378,6 +432,7 @@ spec: required: false schema: openAPIV3Schema: + description: List of failure domains to be used in the cluster. items: properties: cluster: @@ -432,6 +487,8 @@ spec: required: false schema: openAPIV3Schema: + description: Additional categories to be added to the machine deployment in + cluster. items: properties: key: diff --git a/templates/clusterclass/clusterclass.yaml b/templates/clusterclass/clusterclass.yaml index 358734de9c..469c083023 100644 --- a/templates/clusterclass/clusterclass.yaml +++ b/templates/clusterclass/clusterclass.yaml @@ -200,6 +200,19 @@ spec: template: | - type: name name: {{ .controlPlaneMachineDetails.subnetName }} + - name: update-control-plane-machine-template-gpus + enabledIf: "{{if .controlPlaneMachineDetails.gpus}}true{{end}}" + definitions: + - selector: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: NutanixMachineTemplate + matchResources: + controlPlane: true + jsonPatches: + - op: add + path: /spec/template/spec/gpus + valueFrom: + variable: controlPlaneMachineDetails.gpus - name: update-worker-machine-template definitions: - selector: @@ -240,6 +253,21 @@ spec: template: | - type: name name: {{ .controlPlaneMachineDetails.subnetName }} + - name: update-worker-machine-template-gpus + enabledIf: "{{if .workerMachineDetails.gpus}}true{{end}}" + definitions: + - selector: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: NutanixMachineTemplate + matchResources: + machineDeploymentClass: + names: + - nutanix-quick-start-worker + jsonPatches: + - op: add + path: /spec/template/spec/gpus + valueFrom: + variable: workerMachineDetails.gpus - name: add-failure-domains enabledIf: "{{if .failureDomains}}true{{end}}" definitions: @@ -324,6 +352,7 @@ spec: required: true schema: openAPIV3Schema: + description: IP and port of the control plane endpoint. properties: IP: type: string @@ -334,6 +363,7 @@ spec: required: true schema: openAPIV3Schema: + description: Endpoint and credentials of the Prism Central. properties: address: type: string @@ -350,6 +380,7 @@ spec: required: true schema: openAPIV3Schema: + description: Details of the control plane machine deployment. properties: bootType: type: string @@ -367,11 +398,23 @@ spec: type: string subnetName: type: string + gpus: + type: array + items: + type: object + properties: + name: + type: string + deviceID: + type: integer + type: + type: string type: object - name: workerMachineDetails required: true schema: openAPIV3Schema: + description: Details of the worker machine deployment. properties: bootType: type: string @@ -389,11 +432,23 @@ spec: type: string subnetName: type: string + gpus: + type: array + items: + type: object + properties: + name: + type: string + deviceID: + type: integer + type: + type: string type: object - name: failureDomains required: false schema: openAPIV3Schema: + description: List of failure domains to be used in the cluster. type: array items: type: object @@ -448,6 +503,7 @@ spec: required: false schema: openAPIV3Schema: + description: Additional categories to be added to the machine deployment in cluster. type: array items: type: object diff --git a/templates/template_test.go b/templates/template_test.go index de7ef1d55c..4b22f478a6 100644 --- a/templates/template_test.go +++ b/templates/template_test.go @@ -19,6 +19,7 @@ import ( "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/clientcmd" "k8s.io/klog/v2/textlogger" + "k8s.io/utils/ptr" capiv1 "sigs.k8s.io/cluster-api/api/v1beta1" clusterctllog "sigs.k8s.io/cluster-api/cmd/clusterctl/log" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" @@ -238,6 +239,29 @@ func fetchControlPlaneMachineTemplate(clnt client.Client, clusterName string) (* return nil, fmt.Errorf("no control plane NutanixMachineTemplate found for cluster %s", clusterName) } +func fetchWorkerMachineTemplates(clnt client.Client, clusterName string) ([]*v1beta1.NutanixMachineTemplate, error) { + nmts, err := fetchMachineTemplates(clnt, clusterName) + if err != nil { + return nil, err + } + + kcp, err := fetchKubeadmControlPlane(clnt, clusterName) + if err != nil { + return nil, err + } + + workerNmts := make([]*v1beta1.NutanixMachineTemplate, 0) + for _, nmt := range nmts { + if nmt.ObjectMeta.Name == kcp.Spec.MachineTemplate.InfrastructureRef.Name { + continue + } + + workerNmts = append(workerNmts, nmt) + } + + return workerNmts, nil +} + func TestClusterClassTemplateSuite(t *testing.T) { RegisterFailHandler(Fail) BeforeSuite(func() { @@ -338,11 +362,6 @@ var _ = Describe("Cluster Class Template Patches Test Suite", Ordered, func() { err = clnt.Create(context.Background(), obj) // Create the cluster Expect(err).NotTo(HaveOccurred()) - Eventually(func() error { - _, err = fetchNutanixCluster(clnt, obj.GetName()) - return err - }).Within(time.Minute).Should(Succeed()) - Eventually(func() ([]*v1beta1.NutanixMachineTemplate, error) { return fetchMachineTemplates(clnt, obj.GetName()) }).Within(time.Minute).Should(And(HaveLen(2), @@ -361,11 +380,6 @@ var _ = Describe("Cluster Class Template Patches Test Suite", Ordered, func() { err = clnt.Create(context.Background(), obj) // Create the cluster Expect(err).NotTo(HaveOccurred()) - Eventually(func() error { - _, err = fetchNutanixCluster(clnt, obj.GetName()) - return err - }).Within(time.Minute).Should(Succeed()) - Eventually(func() ([]*v1beta1.NutanixMachineTemplate, error) { return fetchMachineTemplates(clnt, obj.GetName()) }).Within(time.Minute).Should(And(HaveLen(2), @@ -384,11 +398,6 @@ var _ = Describe("Cluster Class Template Patches Test Suite", Ordered, func() { err = clnt.Create(context.Background(), obj) // Create the cluster Expect(err).NotTo(HaveOccurred()) - Eventually(func() error { - _, err = fetchNutanixCluster(clnt, obj.GetName()) - return err - }).Within(time.Minute).Should(Succeed()) - Eventually(func() ([]*v1beta1.NutanixMachineTemplate, error) { return fetchMachineTemplates(clnt, obj.GetName()) }).Within(time.Minute).Should(And(HaveLen(2), @@ -400,4 +409,35 @@ var _ = Describe("Cluster Class Template Patches Test Suite", Ordered, func() { }))))) }) }) + + Describe("patches for GPUs", func() { + It("should have correct GPUs", func() { + clusterManifest := "testdata/cluster-with-gpu.yaml" + obj, err := getClusterManifest(clusterManifest) + Expect(err).NotTo(HaveOccurred()) + + err = clnt.Create(context.Background(), obj) // Create the cluster + Expect(err).NotTo(HaveOccurred()) + + Eventually(func() (*v1beta1.NutanixMachineTemplate, error) { + return fetchControlPlaneMachineTemplate(clnt, obj.GetName()) + }).Within(time.Minute).Should(And(HaveExistingField("Spec.Template.Spec.GPUs"), + HaveField("Spec.Template.Spec.GPUs", HaveLen(1)), + HaveField("Spec.Template.Spec.GPUs", ContainElement(v1beta1.NutanixGPU{ + Type: v1beta1.NutanixGPUIdentifierDeviceID, + DeviceID: ptr.To(int64(42)), + })))) + + Eventually(func() ([]*v1beta1.NutanixMachineTemplate, error) { + return fetchWorkerMachineTemplates(clnt, obj.GetName()) + }).Within(time.Minute).Should(And(HaveLen(1), + HaveEach(HaveExistingField("Spec.Template.Spec.GPUs")), + HaveEach(HaveField("Spec.Template.Spec.GPUs", HaveLen(1))), + HaveEach(HaveField("Spec.Template.Spec.GPUs", ContainElement(v1beta1.NutanixGPU{ + Type: v1beta1.NutanixGPUIdentifierName, + Name: ptr.To("fake-gpu"), + }))), + )) + }) + }) }) diff --git a/templates/testdata/cluster-with-gpu.yaml b/templates/testdata/cluster-with-gpu.yaml new file mode 100644 index 0000000000..741dab7635 --- /dev/null +++ b/templates/testdata/cluster-with-gpu.yaml @@ -0,0 +1,58 @@ +apiVersion: cluster.x-k8s.io/v1beta1 +kind: Cluster +metadata: + labels: + ccm: nutanix + cluster.x-k8s.io/cluster-name: cluster-with-gpu + name: cluster-with-gpu +spec: + topology: + class: nutanix-quick-start + controlPlane: + metadata: {} + replicas: 1 + variables: + - name: sshKey + value: ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMe61GqA9gqeX3zDCiwuU8zEDt3ckLnfVm8ZxN7UuFyL user@host + - name: controlPlaneEndpoint + value: + IP: 1.2.3.4 + port: 6443 + - name: prismCentralEndpoint + value: + address: prismcentral.fake + credentialSecret: nutanix-quick-start-pc-creds + insecure: false + port: 9440 + - name: controlPlaneMachineDetails + value: + bootType: legacy + clusterName: fake-cluster + imageName: ubuntu-2204-kube-v1.29.2.qcow2 + memorySize: 4Gi + subnetName: fake-subnet + systemDiskSize: 40Gi + vcpuSockets: 2 + vcpusPerSocket: 1 + gpus: + - type: deviceID + deviceID: 42 + - name: workerMachineDetails + value: + bootType: legacy + clusterName: fake-cluster + imageName: ubuntu-2204-kube-v1.29.2.qcow2 + memorySize: 4Gi + subnetName: fake-subnet + systemDiskSize: 40Gi + vcpuSockets: 2 + vcpusPerSocket: 1 + gpus: + - type: name + name: fake-gpu + version: v1.29.2 + workers: + machineDeployments: + - class: nutanix-quick-start-worker + name: md-0 + replicas: 2