Skip to content

Commit

Permalink
Add variable for setting the GPUs on cluster
Browse files Browse the repository at this point in the history
The change also includes a patch to set the GPUs on
nutanixmachinetemplate resources for control plane and worker
machine deployments.
  • Loading branch information
thunderboltsid committed Apr 4, 2024
1 parent 2e57c08 commit 401262f
Show file tree
Hide file tree
Showing 4 changed files with 226 additions and 15 deletions.
57 changes: 57 additions & 0 deletions templates/cluster-template-clusterclass.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,19 @@ spec:
matchResources:
controlPlane: true
name: update-control-plane-machine-template
# Patch: copies the controlPlaneMachineDetails.gpus variable into
# /spec/template/spec/gpus of the control plane NutanixMachineTemplate.
- definitions:
- jsonPatches:
- op: add
path: /spec/template/spec/gpus
valueFrom:
variable: controlPlaneMachineDetails.gpus
selector:
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: NutanixMachineTemplate
matchResources:
controlPlane: true
# Template renders "true" only when the gpus variable is set and non-empty.
enabledIf: '{{if .controlPlaneMachineDetails.gpus}}true{{end}}'
name: update-control-plane-machine-template-gpus
- definitions:
- jsonPatches:
- op: add
Expand Down Expand Up @@ -224,6 +237,21 @@ spec:
names:
- nutanix-quick-start-worker
name: update-worker-machine-template
# Patch: copies the workerMachineDetails.gpus variable into
# /spec/template/spec/gpus of the NutanixMachineTemplate used by the
# nutanix-quick-start-worker machine deployment class.
- definitions:
- jsonPatches:
- op: add
path: /spec/template/spec/gpus
valueFrom:
variable: workerMachineDetails.gpus
selector:
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: NutanixMachineTemplate
matchResources:
machineDeploymentClass:
names:
- nutanix-quick-start-worker
# Template renders "true" only when the gpus variable is set and non-empty.
enabledIf: '{{if .workerMachineDetails.gpus}}true{{end}}'
name: update-worker-machine-template-gpus
- definitions:
- jsonPatches:
- op: replace
Expand Down Expand Up @@ -308,6 +336,7 @@ spec:
required: true
schema:
openAPIV3Schema:
description: IP and port of the control plane endpoint.
properties:
IP:
type: string
Expand All @@ -318,6 +347,7 @@ spec:
required: true
schema:
openAPIV3Schema:
description: Endpoint and credentials of the Prism Central.
properties:
additionalTrustBundle:
type: string
Expand All @@ -334,11 +364,23 @@ spec:
required: true
schema:
openAPIV3Schema:
description: Details of the control plane machine deployment.
properties:
bootType:
type: string
clusterName:
type: string
gpus:
items:
properties:
deviceID:
type: integer
name:
type: string
type:
type: string
type: object
type: array
imageName:
type: string
memorySize:
Expand All @@ -356,11 +398,23 @@ spec:
required: true
schema:
openAPIV3Schema:
description: Details of the worker machine deployment.
properties:
bootType:
type: string
clusterName:
type: string
gpus:
items:
properties:
deviceID:
type: integer
name:
type: string
type:
type: string
type: object
type: array
imageName:
type: string
memorySize:
Expand All @@ -378,6 +432,7 @@ spec:
required: false
schema:
openAPIV3Schema:
description: List of failure domains to be used in the cluster.
items:
properties:
cluster:
Expand Down Expand Up @@ -432,6 +487,8 @@ spec:
required: false
schema:
openAPIV3Schema:
description: Additional categories to be added to the machine deployment in
cluster.
items:
properties:
key:
Expand Down
56 changes: 56 additions & 0 deletions templates/clusterclass/clusterclass.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,19 @@ spec:
template: |
- type: name
name: {{ .controlPlaneMachineDetails.subnetName }}
# Patch: copies the controlPlaneMachineDetails.gpus variable into
# /spec/template/spec/gpus of the control plane NutanixMachineTemplate.
- name: update-control-plane-machine-template-gpus
# Template renders "true" only when the gpus variable is set and non-empty.
enabledIf: "{{if .controlPlaneMachineDetails.gpus}}true{{end}}"
definitions:
- selector:
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: NutanixMachineTemplate
matchResources:
controlPlane: true
jsonPatches:
- op: add
path: /spec/template/spec/gpus
valueFrom:
variable: controlPlaneMachineDetails.gpus
- name: update-worker-machine-template
definitions:
- selector:
Expand Down Expand Up @@ -240,6 +253,21 @@ spec:
template: |
- type: name
name: {{ .controlPlaneMachineDetails.subnetName }}
# Patch: copies the workerMachineDetails.gpus variable into
# /spec/template/spec/gpus of the NutanixMachineTemplate used by the
# nutanix-quick-start-worker machine deployment class.
- name: update-worker-machine-template-gpus
# Template renders "true" only when the gpus variable is set and non-empty.
enabledIf: "{{if .workerMachineDetails.gpus}}true{{end}}"
definitions:
- selector:
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: NutanixMachineTemplate
matchResources:
machineDeploymentClass:
names:
- nutanix-quick-start-worker
jsonPatches:
- op: add
path: /spec/template/spec/gpus
valueFrom:
variable: workerMachineDetails.gpus
- name: add-failure-domains
enabledIf: "{{if .failureDomains}}true{{end}}"
definitions:
Expand Down Expand Up @@ -324,6 +352,7 @@ spec:
required: true
schema:
openAPIV3Schema:
description: IP and port of the control plane endpoint.
properties:
IP:
type: string
Expand All @@ -334,6 +363,7 @@ spec:
required: true
schema:
openAPIV3Schema:
description: Endpoint and credentials of the Prism Central.
properties:
address:
type: string
Expand All @@ -350,6 +380,7 @@ spec:
required: true
schema:
openAPIV3Schema:
description: Details of the control plane machine deployment.
properties:
bootType:
type: string
Expand All @@ -367,11 +398,23 @@ spec:
type: string
subnetName:
type: string
gpus:
type: array
items:
type: object
properties:
name:
type: string
deviceID:
type: integer
type:
type: string
type: object
- name: workerMachineDetails
required: true
schema:
openAPIV3Schema:
description: Details of the worker machine deployment.
properties:
bootType:
type: string
Expand All @@ -389,11 +432,23 @@ spec:
type: string
subnetName:
type: string
gpus:
type: array
items:
type: object
properties:
name:
type: string
deviceID:
type: integer
type:
type: string
type: object
- name: failureDomains
required: false
schema:
openAPIV3Schema:
description: List of failure domains to be used in the cluster.
type: array
items:
type: object
Expand Down Expand Up @@ -448,6 +503,7 @@ spec:
required: false
schema:
openAPIV3Schema:
description: Additional categories to be added to the machine deployment in cluster.
type: array
items:
type: object
Expand Down
70 changes: 55 additions & 15 deletions templates/template_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2/textlogger"
"k8s.io/utils/ptr"
capiv1 "sigs.k8s.io/cluster-api/api/v1beta1"
clusterctllog "sigs.k8s.io/cluster-api/cmd/clusterctl/log"
controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
Expand Down Expand Up @@ -238,6 +239,29 @@ func fetchControlPlaneMachineTemplate(clnt client.Client, clusterName string) (*
return nil, fmt.Errorf("no control plane NutanixMachineTemplate found for cluster %s", clusterName)
}

// fetchWorkerMachineTemplates returns every NutanixMachineTemplate reported by
// fetchMachineTemplates for clusterName except the one whose name matches the
// infrastructure reference of the cluster's KubeadmControlPlane. The result is
// an empty, non-nil slice when no other template exists. Any error from the
// underlying lookups is returned unchanged.
func fetchWorkerMachineTemplates(clnt client.Client, clusterName string) ([]*v1beta1.NutanixMachineTemplate, error) {
	allTemplates, err := fetchMachineTemplates(clnt, clusterName)
	if err != nil {
		return nil, err
	}

	controlPlane, err := fetchKubeadmControlPlane(clnt, clusterName)
	if err != nil {
		return nil, err
	}

	// Keep only the templates that are not referenced by the control plane.
	workers := []*v1beta1.NutanixMachineTemplate{}
	for i := range allTemplates {
		candidate := allTemplates[i]
		if candidate.ObjectMeta.Name != controlPlane.Spec.MachineTemplate.InfrastructureRef.Name {
			workers = append(workers, candidate)
		}
	}

	return workers, nil
}

func TestClusterClassTemplateSuite(t *testing.T) {
RegisterFailHandler(Fail)
BeforeSuite(func() {
Expand Down Expand Up @@ -338,11 +362,6 @@ var _ = Describe("Cluster Class Template Patches Test Suite", Ordered, func() {
err = clnt.Create(context.Background(), obj) // Create the cluster
Expect(err).NotTo(HaveOccurred())

Eventually(func() error {
_, err = fetchNutanixCluster(clnt, obj.GetName())
return err
}).Within(time.Minute).Should(Succeed())

Eventually(func() ([]*v1beta1.NutanixMachineTemplate, error) {
return fetchMachineTemplates(clnt, obj.GetName())
}).Within(time.Minute).Should(And(HaveLen(2),
Expand All @@ -361,11 +380,6 @@ var _ = Describe("Cluster Class Template Patches Test Suite", Ordered, func() {
err = clnt.Create(context.Background(), obj) // Create the cluster
Expect(err).NotTo(HaveOccurred())

Eventually(func() error {
_, err = fetchNutanixCluster(clnt, obj.GetName())
return err
}).Within(time.Minute).Should(Succeed())

Eventually(func() ([]*v1beta1.NutanixMachineTemplate, error) {
return fetchMachineTemplates(clnt, obj.GetName())
}).Within(time.Minute).Should(And(HaveLen(2),
Expand All @@ -384,11 +398,6 @@ var _ = Describe("Cluster Class Template Patches Test Suite", Ordered, func() {
err = clnt.Create(context.Background(), obj) // Create the cluster
Expect(err).NotTo(HaveOccurred())

Eventually(func() error {
_, err = fetchNutanixCluster(clnt, obj.GetName())
return err
}).Within(time.Minute).Should(Succeed())

Eventually(func() ([]*v1beta1.NutanixMachineTemplate, error) {
return fetchMachineTemplates(clnt, obj.GetName())
}).Within(time.Minute).Should(And(HaveLen(2),
Expand All @@ -400,4 +409,35 @@ var _ = Describe("Cluster Class Template Patches Test Suite", Ordered, func() {
})))))
})
})

Describe("patches for GPUs", func() {
It("should have correct GPUs", func() {
clusterManifest := "testdata/cluster-with-gpu.yaml"
obj, err := getClusterManifest(clusterManifest)
Expect(err).NotTo(HaveOccurred())

err = clnt.Create(context.Background(), obj) // Create the cluster
Expect(err).NotTo(HaveOccurred())

Eventually(func() (*v1beta1.NutanixMachineTemplate, error) {
return fetchControlPlaneMachineTemplate(clnt, obj.GetName())
}).Within(time.Minute).Should(And(HaveExistingField("Spec.Template.Spec.GPUs"),
HaveField("Spec.Template.Spec.GPUs", HaveLen(1)),
HaveField("Spec.Template.Spec.GPUs", ContainElement(v1beta1.NutanixGPU{
Type: v1beta1.NutanixGPUIdentifierDeviceID,
DeviceID: ptr.To(int64(42)),
}))))

Eventually(func() ([]*v1beta1.NutanixMachineTemplate, error) {
return fetchWorkerMachineTemplates(clnt, obj.GetName())
}).Within(time.Minute).Should(And(HaveLen(1),
HaveEach(HaveExistingField("Spec.Template.Spec.GPUs")),
HaveEach(HaveField("Spec.Template.Spec.GPUs", HaveLen(1))),
HaveEach(HaveField("Spec.Template.Spec.GPUs", ContainElement(v1beta1.NutanixGPU{
Type: v1beta1.NutanixGPUIdentifierName,
Name: ptr.To("fake-gpu"),
}))),
))
})
})
})
Loading

0 comments on commit 401262f

Please sign in to comment.