Commit 38dc927
Merge pull request #298 from nebius/MSP-3875
MSP-3875: add maintenance mode
Uburro authored Jan 7, 2025
2 parents 74841e3 + a132f0b commit 38dc927
Showing 32 changed files with 250 additions and 78 deletions.
14 changes: 10 additions & 4 deletions api/v1/slurmcluster_types.go
@@ -5,6 +5,7 @@ import (
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"nebius.ai/slurm-operator/internal/consts"

mariadbv1alpha1 "github.com/mariadb-operator/mariadb-operator/api/v1alpha1"
prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
@@ -22,12 +23,17 @@ type SlurmClusterSpec struct {
// +kubebuilder:validation:Optional
// +kubebuilder:default="gpu"
ClusterType string `json:"clusterType,omitempty"`

// Pause defines whether to gracefully stop the cluster.
// Setting it to false after cluster has been paused starts the cluster back
// Maintenance defines the maintenance mode of the cluster.
// It can have the following values:
// - none: No maintenance is performed. The cluster operates normally.
// - downscale: Scales down all components to 0.
// - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the Kubernetes populateJail Job.
// - skipPopulateJail: Skips the execution of the populateJail job during maintenance.
//
// +kubebuilder:validation:Optional
Pause bool `json:"pause,omitempty"` // TODO cluster pausing/resuming
// +kubebuilder:validation:Enum=none;downscale;downscaleAndDeletePopulateJail;skipPopulateJail
// +kubebuilder:default="none"
Maintenance *consts.MaintenanceMode `json:"maintenance,omitempty"`

// NCCLSettings
// +kubebuilder:validation:Optional
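For illustration, a minimal Go sketch (not part of the commit; the program is hypothetical, but the field and constant names come from the diff above) showing how a client could set the new field on a SlurmClusterSpec:

package main

import (
    "fmt"

    slurmv1 "nebius.ai/slurm-operator/api/v1"
    "nebius.ai/slurm-operator/internal/consts"
)

func main() {
    // Request "downscale" maintenance. Maintenance is a pointer: leaving it
    // nil lets the API server apply the CRD default ("none").
    mode := consts.ModeDownscale
    spec := slurmv1.SlurmClusterSpec{
        ClusterType: "gpu",
        Maintenance: &mode,
    }
    fmt.Println(*spec.Maintenance) // downscale
}
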
6 changes: 6 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default.

20 changes: 15 additions & 5 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
@@ -1065,6 +1065,21 @@ spec:
type: object
minItems: 1
type: array
maintenance:
default: none
description: |-
Maintenance defines the maintenance mode of the cluster.
It can have the following values:
- none: No maintenance is performed. The cluster operates normally.
- downscale: Scales down all components to 0.
- downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the Kubernetes populateJail Job.
- skipPopulateJail: Skips the execution of the populateJail job during maintenance.
enum:
- none
- downscale
- downscaleAndDeletePopulateJail
- skipPopulateJail
type: string
ncclSettings:
description: NCCLSettings
properties:
@@ -1101,11 +1116,6 @@ spec:
type: string
type: array
type: object
pause:
description: |-
Pause defines whether to gracefully stop the cluster.
Setting it to false after cluster has been paused starts the cluster back
type: boolean
periodicChecks:
description: PeriodicChecks define the k8s CronJobs performing cluster
checks
1 change: 1 addition & 0 deletions helm/slurm-cluster/templates/slurm-cluster-cr.yaml
@@ -17,6 +17,7 @@ spec:
slurmConfig:
{{- toYaml .Values.slurmConfig | nindent 4 }}
{{- end }}
maintenance: {{ default "none" .Values.maintenance | quote }}
crVersion: {{ .Chart.Version }}
useDefaultAppArmorProfile: {{ .Values.useDefaultAppArmorProfile }}
pause: {{ .Values.pause }}
9 changes: 7 additions & 2 deletions helm/slurm-cluster/values.yaml
@@ -3,8 +3,13 @@ clusterName: "slurm1"
annotations: {}
# Add appArmor profile to the cluster
useDefaultAppArmorProfile: true
# Whether to gracefully stop the cluster. Setting it to false after cluster has been paused starts the cluster back
pause: false
# Maintenance defines the maintenance mode of the cluster.
# It can have the following values:
# - none: No maintenance is performed. The cluster operates normally.
# - downscale: Scales down all components to 0.
# - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the Kubernetes populateJail Job.
# - skipPopulateJail: Skips the execution of the populateJail job during maintenance.
maintenance: "none"
# Slurm cluster type. Can be either gpu or cpu
clusterType: gpu
# partitionConfiguration define partition configuration of slurm worker nodes
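As a usage illustration, a hypothetical values override (the file name is assumed) that renders the chart with the cluster in downscale maintenance:

# my-values.yaml (hypothetical)
clusterName: "slurm1"
clusterType: gpu
# One of: none, downscale, downscaleAndDeletePopulateJail, skipPopulateJail
maintenance: "downscale"
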
20 changes: 15 additions & 5 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
@@ -1064,6 +1064,21 @@ spec:
type: object
minItems: 1
type: array
maintenance:
default: none
description: |-
Maintenance defines the maintenance mode of the cluster.
It can have the following values:
- none: No maintenance is performed. The cluster operates normally.
- downscale: Scales down all components to 0.
- downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the Kubernetes populateJail Job.
- skipPopulateJail: Skips the execution of the populateJail job during maintenance.
enum:
- none
- downscale
- downscaleAndDeletePopulateJail
- skipPopulateJail
type: string
ncclSettings:
description: NCCLSettings
properties:
@@ -1100,11 +1115,6 @@ spec:
type: string
type: array
type: object
pause:
description: |-
Pause defines whether to gracefully stop the cluster.
Setting it to false after cluster has been paused starts the cluster back
type: boolean
periodicChecks:
description: PeriodicChecks define the k8s CronJobs performing cluster
checks
20 changes: 15 additions & 5 deletions helm/soperator/crds/slurmcluster-crd.yaml
@@ -1064,6 +1064,21 @@ spec:
type: object
minItems: 1
type: array
maintenance:
default: none
description: |-
Maintenance defines the maintenance mode of the cluster.
It can have the following values:
- none: No maintenance is performed. The cluster operates normally.
- downscale: Scales down all components to 0.
- downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the Kubernetes populateJail Job.
- skipPopulateJail: Skips the execution of the populateJail job during maintenance.
enum:
- none
- downscale
- downscaleAndDeletePopulateJail
- skipPopulateJail
type: string
ncclSettings:
description: NCCLSettings
properties:
@@ -1100,11 +1115,6 @@ spec:
type: string
type: array
type: object
pause:
description: |-
Pause defines whether to gracefully stop the cluster.
Setting it to false after cluster has been paused starts the cluster back
type: boolean
periodicChecks:
description: PeriodicChecks define the k8s CronJobs performing cluster
checks
15 changes: 15 additions & 0 deletions internal/check/maintanence.go
@@ -0,0 +1,15 @@
package check

import "nebius.ai/slurm-operator/internal/consts"

func IsMaintenanceActive(maintenance *consts.MaintenanceMode) bool {
    return maintenance != nil && *maintenance != consts.ModeNone
}

func IsModeDownscaleAndDeletePopulate(maintenance *consts.MaintenanceMode) bool {
    return maintenance != nil && *maintenance == consts.ModeDownscaleAndDeletePopulate
}

func IsModeSkipPopulateJail(maintenance *consts.MaintenanceMode) bool {
    return maintenance != nil && *maintenance == consts.ModeSkipPopulateJail
}
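A minimal test sketch (not part of the commit) pinning down the helpers' nil-safe contract, where a nil pointer means no maintenance was requested:

package check_test

import (
    "testing"

    "nebius.ai/slurm-operator/internal/check"
    "nebius.ai/slurm-operator/internal/consts"
)

func TestMaintenanceHelpers(t *testing.T) {
    // nil and ModeNone are both inactive.
    if check.IsMaintenanceActive(nil) {
        t.Error("nil mode must be treated as inactive")
    }
    none := consts.ModeNone
    if check.IsMaintenanceActive(&none) {
        t.Error("ModeNone must be inactive")
    }
    // Any other mode counts as active maintenance.
    down := consts.ModeDownscaleAndDeletePopulate
    if !check.IsMaintenanceActive(&down) || !check.IsModeDownscaleAndDeletePopulate(&down) {
        t.Error("downscaleAndDeletePopulateJail must be active and detected")
    }
}
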
15 changes: 15 additions & 0 deletions internal/consts/maintenance.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package consts

type MaintenanceMode string

const (
    ModeNone                       MaintenanceMode = "none"
    ModeDownscale                  MaintenanceMode = "downscale"
    ModeDownscaleAndDeletePopulate MaintenanceMode = "downscaleAndDeletePopulateJail"
    ModeSkipPopulateJail           MaintenanceMode = "skipPopulateJail"
)

const (
    ZeroReplicas   = int32(0)
    SingleReplicas = int32(1)
)
69 changes: 44 additions & 25 deletions internal/controller/clustercontroller/populate_job.go
@@ -12,6 +12,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/log"

slurmv1 "nebius.ai/slurm-operator/api/v1"
"nebius.ai/slurm-operator/internal/check"
"nebius.ai/slurm-operator/internal/logfield"
"nebius.ai/slurm-operator/internal/render/populate_jail"
"nebius.ai/slurm-operator/internal/utils"
@@ -36,42 +37,60 @@ func (r SlurmClusterReconciler) ReconcilePopulateJail(
stepLogger := log.FromContext(stepCtx)
stepLogger.Info("Reconciling")

isMaintenanceStopMode := check.IsModeDownscaleAndDeletePopulate(clusterValues.PopulateJail.Maintenance)
desired := batchv1.Job{}
if getErr := r.Get(stepCtx,
getErr := r.Get(stepCtx,
client.ObjectKey{
Namespace: clusterValues.Namespace,
Name: clusterValues.PopulateJail.Name,
},
&desired,
); getErr != nil {
if !apierrors.IsNotFound(getErr) {
stepLogger.Error(getErr, "Failed to get")
return errors.Wrap(getErr, "getting Populate jail Job")
)
if getErr == nil {
stepLogger.Info("Already exists")
if isMaintenanceStopMode {
stepLogger.Info("Deleting")
if err := r.Delete(stepCtx, &desired); err != nil {
stepLogger.Error(err, "Failed to delete")
return errors.Wrap(err, "deleting Populate jail Job")
}
stepLogger.Info("Deleted")
}
return nil
}

renderedDesired, err := populate_jail.RenderPopulateJailJob(
clusterValues.Namespace,
clusterValues.Name,
clusterValues.ClusterType,
clusterValues.NodeFilters,
clusterValues.VolumeSources,
&clusterValues.PopulateJail,
)
if err != nil {
stepLogger.Error(err, "Failed to render")
return errors.Wrap(err, "rendering Populate jail Job")
}
desired = *renderedDesired.DeepCopy()
if !apierrors.IsNotFound(getErr) && !isMaintenanceStopMode {
stepLogger.Error(getErr, "Failed to get")
return errors.Wrap(getErr, "getting Populate jail Job")
}

stepLogger = stepLogger.WithValues(logfield.ResourceKV(&desired)...)
stepLogger.Info("Rendered")
if isMaintenanceStopMode {
stepLogger.Info("Skipping creation due to MaintenanceStopMode")
return nil
}

if err = r.Job.Reconcile(stepCtx, cluster, &desired); err != nil {
stepLogger.Error(err, "Failed to reconcile")
return errors.Wrap(err, "reconciling Populate jail Job")
}
stepLogger.Info("Reconciled")
renderedDesired, err := populate_jail.RenderPopulateJailJob(
clusterValues.Namespace,
clusterValues.Name,
clusterValues.ClusterType,
clusterValues.NodeFilters,
clusterValues.VolumeSources,
&clusterValues.PopulateJail,
)
if err != nil {
stepLogger.Error(err, "Failed to render")
return errors.Wrap(err, "rendering Populate jail Job")
}
desired = *renderedDesired.DeepCopy()

stepLogger = stepLogger.WithValues(logfield.ResourceKV(&desired)...)
stepLogger.Info("Rendered")

if err = r.Job.Reconcile(stepCtx, cluster, &desired); err != nil {
stepLogger.Error(err, "Failed to reconcile")
return errors.Wrap(err, "reconciling Populate jail Job")
}
stepLogger.Info("Reconciled")

if pollErr := wait.PollUntilContextCancel(stepCtx,
10*time.Second,
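The reworked control flow can be hard to follow from the diff alone. The following helper is hypothetical (not in the commit) and merely condenses the branching ReconcilePopulateJail now performs, given the result of the initial Get and whether the downscaleAndDeletePopulateJail mode is active:

package main

import (
    "fmt"

    apierrors "k8s.io/apimachinery/pkg/api/errors"
)

// decidePopulateJailAction condenses the branching in ReconcilePopulateJail:
// getErr is the result of fetching the existing Job; stopMode is true when
// the mode is downscaleAndDeletePopulateJail.
func decidePopulateJailAction(getErr error, stopMode bool) string {
    switch {
    case getErr == nil && stopMode:
        return "delete the existing Job"
    case getErr == nil:
        return "keep the existing Job"
    case !apierrors.IsNotFound(getErr) && !stopMode:
        return "return the Get error"
    case stopMode:
        return "skip creation"
    default: // NotFound and not in stop mode
        return "render and reconcile the Job"
    }
}

func main() {
    fmt.Println(decidePopulateJailAction(nil, true)) // delete the existing Job
}
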
6 changes: 4 additions & 2 deletions internal/controller/clustercontroller/reconcile.go
@@ -226,8 +226,10 @@ func (r *SlurmClusterReconciler) reconcile(ctx context.Context, cluster *slurmv1
res, err := r.runWithPhase(ctx, cluster,
ptr.To(slurmv1.PhaseClusterReconciling),
func() (ctrl.Result, error) {
if err = r.ReconcilePopulateJail(ctx, clusterValues, cluster); err != nil {
return ctrl.Result{}, err
if !check.IsModeSkipPopulateJail(clusterValues.PopulateJail.Maintenance) {
if err = r.ReconcilePopulateJail(ctx, clusterValues, cluster); err != nil {
return ctrl.Result{}, err
}
}
if err = r.ReconcileCommon(ctx, cluster, clusterValues); err != nil {
return ctrl.Result{}, err
1 change: 1 addition & 0 deletions internal/controller/clustercontroller/worker.go
@@ -47,6 +47,7 @@ func (r SlurmClusterReconciler) ReconcileWorkers(
clusterValues.Name,
clusterValues.NodeWorker.K8sNodeFilterName,
clusterValues.NodeFilters,
clusterValues.NodeWorker.Maintenance,
)
stepLogger = stepLogger.WithValues(logfield.ResourceKV(&desired)...)
stepLogger.Info("Rendered")
10 changes: 9 additions & 1 deletion internal/render/accounting/deployment.go
@@ -3,8 +3,10 @@ package accounting
import (
appsv1 "k8s.io/api/apps/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/ptr"

slurmv1 "nebius.ai/slurm-operator/api/v1"
"nebius.ai/slurm-operator/internal/check"
"nebius.ai/slurm-operator/internal/consts"
"nebius.ai/slurm-operator/internal/naming"
"nebius.ai/slurm-operator/internal/render/common"
@@ -32,6 +34,12 @@ func RenderDeployment(
return nil, err
}

replicas := &accounting.Deployment.Replicas

if check.IsMaintenanceActive(accounting.Maintenance) {
replicas = ptr.To(consts.ZeroReplicas)
}

return &appsv1.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: naming.BuildDeploymentName(consts.ComponentTypeAccounting),
@@ -41,7 +49,7 @@
Spec: appsv1.DeploymentSpec{
// In Deployment mode, replicas should be 1,
// because accounting requires a single instance.
Replicas: &accounting.Deployment.Replicas,
Replicas: replicas,
Strategy: appsv1.DeploymentStrategy{
Type: appsv1.RecreateDeploymentStrategyType,
},
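The same zero-replica override recurs in the MariaDB renderer below. A condensed sketch of the pattern (the effectiveReplicas helper is hypothetical; check, consts, and ptr.To are used as in the diff):

package accounting

import (
    "k8s.io/utils/ptr"

    "nebius.ai/slurm-operator/internal/check"
    "nebius.ai/slurm-operator/internal/consts"
)

// effectiveReplicas forces zero replicas while maintenance is active and
// returns the configured count otherwise.
func effectiveReplicas(configured int32, maintenance *consts.MaintenanceMode) *int32 {
    if check.IsMaintenanceActive(maintenance) {
        return ptr.To(consts.ZeroReplicas)
    }
    return ptr.To(configured)
}
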
5 changes: 5 additions & 0 deletions internal/render/accounting/mariadb.go
@@ -10,6 +10,7 @@ import (
"k8s.io/utils/ptr"

slurmv1 "nebius.ai/slurm-operator/api/v1"
"nebius.ai/slurm-operator/internal/check"
"nebius.ai/slurm-operator/internal/consts"
"nebius.ai/slurm-operator/internal/naming"
"nebius.ai/slurm-operator/internal/render/common"
@@ -32,6 +33,10 @@ func RenderMariaDb(
labels := common.RenderLabels(consts.ComponentTypeMariaDbOperator, clusterName)
port, replicas, antiAffinityEnabled := getMariaDbConfig(mariaDb)

if check.IsMaintenanceActive(accounting.Maintenance) {
replicas = consts.ZeroReplicas
}

nodeFilter, err := utils.GetBy(
nodeFilters,
accounting.K8sNodeFilterName,
2 changes: 1 addition & 1 deletion internal/render/benchmark/cronjob.go
@@ -45,7 +45,7 @@ func RenderNCCLBenchmarkCronJob(
Spec: batchv1.JobSpec{
Parallelism: ptr.To(int32(1)),
Completions: ptr.To(int32(1)),
BackoffLimit: ptr.To(int32(0)),
BackoffLimit: ptr.To(consts.ZeroReplicas),
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: labels,
(Diff truncated: the remaining changed files are not shown.)
