Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MSP-4080: add render, reconcile rebooter and rbac #391

Merged
merged 10 commits into from
Feb 6, 2025
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ SHELL = /usr/bin/env bash -o pipefail
.SHELLFLAGS = -ec

# Limit the scope of generation otherwise it will try to generate configs for non-controller code
GENPATH = "./api/v1;./internal/controller/..."
GENPATH = "./api/v1;"

CHART_PATH = helm
CHART_OPERATOR_PATH = $(CHART_PATH)/soperator
Expand Down Expand Up @@ -78,8 +78,9 @@ help: ## Display this help.

.PHONY: manifests
manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects.
$(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths=$(GENPATH) output:crd:artifacts:config=config/crd/bases

$(CONTROLLER_GEN) crd webhook paths=$(GENPATH) output:crd:artifacts:config=config/crd/bases
$(CONTROLLER_GEN) rbac:roleName=manager-role paths="./internal/controller/clustercontroller/..." output:artifacts:config=config/rbac/clustercontroller/
$(CONTROLLER_GEN) rbac:roleName=node-configurator-role paths="./internal/rebooter/..." output:artifacts:config=config/rbac/node-configurator/
.PHONY: generate
generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
$(CONTROLLER_GEN) object paths=$(GENPATH)
Expand Down
57 changes: 57 additions & 0 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -690,6 +690,63 @@ type SlurmNodeWorker struct {
//
// +kubebuilder:validation:Optional
PriorityClass string `json:"priorityClass,omitempty"`
// It's alpha feature and will be moved to separate CRD in the future
// Rebooter defines the configuration for the Slurm worker node rebooter
//
// +kubebuilder:validation:Optional
Rebooter Rebooter `json:"rebooter"`
}

// Rebooter defines the configuration for the Slurm worker node rebooter
type Rebooter struct {
// enabled defines whether the rebooter is enabled
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=false
Enabled bool `json:"enabled"`

// Image defines the rebooter container image
//
// +kubebuilder:validation:Optional
Image string `json:"image"`

// imagePullPolicy defines the image pull policy
//
// +kubebuilder:validation:Enum=Always;Never;IfNotPresent
// +kubebuilder:validation:Optional
// +kubebuilder:default="IfNotPresent"
ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"`

// Resources defines the [corev1.ResourceRequirements] for the container
//
// +kubebuilder:validation:Optional
Resources corev1.ResourceList `json:"resources,omitempty"`

// evictionMethod defines the method of eviction for the Slurm worker node
// Must be one of [drain, evict]. Now only evict is supported
//
// +kubebuilder:validation:Optional
// +kubebuilder:validation:Enum="evict"
// +kubebuilder:default="evict"
EvictionMethod string `json:"evictionMethod,omitempty"`

// logLevel defines the log level for the rebooter
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="info"
// +kubebuilder:validation:Enum="debug";"info";"warn";"error"
LogLevel string `json:"logLevel,omitempty"`

// Namespace defines the namespace where the rebooter will be deployed
// By default, the same namespace as the soperator
//
// +kubebuilder:validation:Optional
Namespace string `json:"namespace,omitempty"`

// serviceAccountName defines the service account name for the rebooter
//
// +kubebuilder:validation:Optional
ServiceAccountName string `json:"serviceAccountName,omitempty"`
}

// SlurmNodeWorkerVolumes defines the volumes for the Slurm worker node
Expand Down
23 changes: 23 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 20 additions & 6 deletions cmd/rebooter/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/webhook"

"nebius.ai/slurm-operator/internal/consts"
"nebius.ai/slurm-operator/internal/rebooter"
//+kubebuilder:scaffold:imports
)
Expand Down Expand Up @@ -167,20 +168,33 @@ func main() {
os.Exit(1)
}

rebooterParams := rebooter.RebooterParams{
ReconcileTimeout: reconcileTimeout,
}

// Rebooter is a daemonset and should only reconcile the node it is running on
nodeName := os.Getenv("REBOOTER_NODE_NAME")
if nodeName == "" {
setupLog.Error(fmt.Errorf("NODE_NAME environment variable is not set"), "unable to start manager")
rebooterParams.NodeName = os.Getenv(consts.RebooterMethodEnv)
if rebooterParams.NodeName == "" {
errorStr := fmt.Errorf("%s environment variable is not set", consts.RebooterMethodEnv)
setupLog.Error(errorStr, "unable to start manager")
os.Exit(1)
}

switch envEvictionMethod := os.Getenv(consts.RebooterMethodEnv) {
case string(consts.RebooterDrain):
setupLog.Error("Drain method is not implemented yet")
os.Exit(1)
case string(consts.RebooterEvict):
fallthrough
default:
rebooterParams.EvictionMethod = consts.RebooterEvict
}
if err = rebooter.NewRebooterReconciler(
mgr.GetClient(),
mgr.GetScheme(),
mgr.GetEventRecorderFor(rebooter.ControllerName),
reconcileTimeout,
nodeName,
).SetupWithManager(mgr, maxConcurrency, cacheSyncTimeout, nodeName); err != nil {
rebooterParams,
).SetupWithManager(mgr, maxConcurrency, cacheSyncTimeout, rebooterParams.NodeName); err != nil {
setupLog.Error(err, "unable to create controller", rebooter.ControllerName)
os.Exit(1)
}
Expand Down
57 changes: 57 additions & 0 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4032,6 +4032,63 @@ spec:
description: PriorityClass defines the priority class for
the Slurm worker node
type: string
rebooter:
description: |-
It's alpha feature and will be moved to separate CRD in the future
Rebooter defines the configuration for the Slurm worker node rebooter
properties:
enabled:
default: false
description: enabled defines whether the rebooter is enabled
type: boolean
evictionMethod:
default: evict
description: |-
evictionMethod defines the method of eviction for the Slurm worker node
Must be one of [drain, evict]. Now only evict is supported
enum:
- evict
type: string
image:
description: Image defines the rebooter container image
type: string
imagePullPolicy:
default: IfNotPresent
description: imagePullPolicy defines the image pull policy
enum:
- Always
- Never
- IfNotPresent
type: string
logLevel:
default: info
description: logLevel defines the log level for the rebooter
enum:
- debug
- info
- warn
- error
type: string
namespace:
description: |-
Namespace defines the namespace where the rebooter will be deployed
By default, the same namespace as the soperator
type: string
resources:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: Resources defines the [corev1.ResourceRequirements]
for the container
type: object
serviceAccountName:
description: serviceAccountName defines the service account
name for the rebooter
type: string
type: object
size:
description: Size defines the number of node instances
format: int32
Expand Down
File renamed without changes.
25 changes: 25 additions & 0 deletions config/rbac/node-configurator/role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: node-configurator-role
rules:
- apiGroups:
- ""
resources:
- nodes
- pods
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- nodes/status
verbs:
- get
- list
- patch
- update
- watch
20 changes: 20 additions & 0 deletions config/rbac/role_binding.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,23 @@ subjects:
- kind: ServiceAccount
name: controller-manager
namespace: system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/name: clusterrolebinding
app.kubernetes.io/instance: node-configurator-rolebinding
app.kubernetes.io/component: rbac
app.kubernetes.io/created-by: slurm-operator
app.kubernetes.io/part-of: slurm-operator
app.kubernetes.io/managed-by: kustomize
name: node-configurator-rolebinding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: node-configurator-role
subjects:
- kind: ServiceAccount
name: node-configurator
namespace: system
13 changes: 13 additions & 0 deletions config/rbac/service_account.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,16 @@ metadata:
app.kubernetes.io/managed-by: kustomize
name: controller-manager
namespace: system
---
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/name: serviceaccount
app.kubernetes.io/instance: rebooter
app.kubernetes.io/component: rbac
app.kubernetes.io/created-by: slurm-operator
app.kubernetes.io/part-of: slurm-operator
app.kubernetes.io/managed-by: kustomize
name: rebooter
Uburro marked this conversation as resolved.
Show resolved Hide resolved
namespace: system
3 changes: 2 additions & 1 deletion config/rbac/soperator-helm/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
resources:
- ../role.yaml
- ../clustercontroller/role.yaml
- ../node-configurator/role.yaml
1 change: 1 addition & 0 deletions helm/slurm-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -416,3 +416,4 @@ images:
slurmdbd: "cr.eu-north1.nebius.cloud/soperator/controller_slurmdbd:1.17.0-jammy-slurm24.05.5"
exporter: "cr.eu-north1.nebius.cloud/soperator/exporter:1.17.0-jammy-slurm24.05.5"
mariaDB: "docker-registry1.mariadb.com/library/mariadb:11.4.3"
rebooter: "cr.eu-north1.nebius.cloud/soperator/rebooter:1.17.0"
57 changes: 57 additions & 0 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4031,6 +4031,63 @@ spec:
description: PriorityClass defines the priority class for
the Slurm worker node
type: string
rebooter:
description: |-
It's alpha feature and will be moved to separate CRD in the future
Rebooter defines the configuration for the Slurm worker node rebooter
properties:
enabled:
default: false
description: enabled defines whether the rebooter is enabled
type: boolean
evictionMethod:
default: evict
description: |-
evictionMethod defines the method of eviction for the Slurm worker node
Must be one of [drain, evict]. Now only evict is supported
enum:
- evict
type: string
image:
description: Image defines the rebooter container image
type: string
imagePullPolicy:
default: IfNotPresent
description: imagePullPolicy defines the image pull policy
enum:
- Always
- Never
- IfNotPresent
type: string
logLevel:
default: info
description: logLevel defines the log level for the rebooter
enum:
- debug
- info
- warn
- error
type: string
namespace:
description: |-
Namespace defines the namespace where the rebooter will be deployed
By default, the same namespace as the soperator
type: string
resources:
additionalProperties:
anyOf:
- type: integer
- type: string
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
description: Resources defines the [corev1.ResourceRequirements]
for the container
type: object
serviceAccountName:
description: serviceAccountName defines the service account
name for the rebooter
type: string
type: object
size:
description: Size defines the number of node instances
format: int32
Expand Down
Loading
Loading