Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MSP-3645: add task and jobs params into slurm.conf #271

Merged
merged 3 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ type SlurmClusterSpec struct {
// SlurmConfig represents the Slurm configuration in slurm.conf. Not all options are supported.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "Verbose", maxJobCount: 10000, minJobAge: 86400}
SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
}

Expand All @@ -103,10 +104,22 @@ type SlurmConfig struct {
// +kubebuilder:default="Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs"
// +kubebuilder:validation:Pattern="^((Accrue|Agent|AuditRPCs|Backfill|BackfillMap|BurstBuffer|Cgroup|ConMgr|CPU_Bind|CpuFrequency|Data|DBD_Agent|Dependency|Elasticsearch|Energy|Federation|FrontEnd|Gres|Hetjob|Gang|GLOB_SILENCE|JobAccountGather|JobComp|JobContainer|License|Network|NetworkRaw|NodeFeatures|NO_CONF_HASH|Power|Priority|Profile|Protocol|Reservation|Route|Script|SelectType|Steps|Switch|TLS|TraceJobs|Triggers)(,)?)+$"
DebugFlags string `json:"debugFlags,omitempty"`
// Additional parameters for the task plugin
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="Verbose"
// +kubebuilder:validation:Pattern="^((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$"
TaskPluginParam string `json:"taskPluginParam,omitempty"`
// Maximum number of jobs the controller keeps in memory at one time
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=10000
MaxJobCount int32 `json:"maxJobCount,omitempty"`
// Minimum age (in seconds) of a completed job before its record is purged from controller memory
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=86400
MinJobAge int32 `json:"minJobAge,omitempty"`
}

type PartitionConfiguration struct {
Expand Down
27 changes: 27 additions & 0 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1470,6 +1470,15 @@ spec:
type: string
type: object
slurmConfig:
default:
completeWait: 5
debugFlags: Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs
defCpuPerGPU: 16
defMemPerNode: 1228800
maxJobCount: 10000
minJobAge: 86400
taskPlugin: task/cgroup,task/affinity
taskPluginParam: Verbose
description: SlurmConfig represents the Slurm configuration in slurm.conf.
Not all options are supported.
properties:
Expand All @@ -1496,8 +1505,26 @@ spec:
node in mebibytes.
format: int32
type: integer
maxJobCount:
default: 10000
description: Keep N last jobs in controller memory
format: int32
type: integer
minJobAge:
default: 86400
description: Don't remove jobs from controller memory after some
time
format: int32
type: integer
taskPlugin:
default: task/cgroup,task/affinity
description: Identifies the type of task launch plugin (e.g. pinning
tasks to specific processors)
pattern: ^((task/affinity|task/cgroup|task/none)(,)?)+$
type: string
taskPluginParam:
default: Verbose
description: Additional parameters for the task plugin
pattern: ^((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$
type: string
type: object
Expand Down
4 changes: 4 additions & 0 deletions helm/slurm-cluster/templates/slurm-cluster-cr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ metadata:
{{- . | toYaml | nindent 4 }}
{{- end }}
spec:
{{- if .Values.slurmConfig }}
slurmConfig:
{{- toYaml .Values.slurmConfig | nindent 4 }}
{{- end }}
crVersion: {{ .Chart.Version }}
pause: {{ .Values.pause }}
clusterType: {{ .Values.clusterType }}
Expand Down
8 changes: 8 additions & 0 deletions helm/slurm-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,14 @@ periodicChecks:
k8sNodeFilterName: "no-gpu"
imagePullPolicy: "IfNotPresent"
appArmorProfile: "unconfined"
slurmConfig: {}
# defMemPerNode: 1228800
# defCpuPerGPU: 16
# completeWait: 5
# debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs"
# taskPluginParam: "Verbose"
# maxJobCount: 10000
# minJobAge: 86400
slurmNodes:
accounting:
enabled: false
Expand Down
27 changes: 27 additions & 0 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1469,6 +1469,15 @@ spec:
type: string
type: object
slurmConfig:
default:
completeWait: 5
debugFlags: Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs
defCpuPerGPU: 16
defMemPerNode: 1228800
maxJobCount: 10000
minJobAge: 86400
taskPlugin: task/cgroup,task/affinity
taskPluginParam: Verbose
description: SlurmConfig represents the Slurm configuration in slurm.conf.
Not all options are supported.
properties:
Expand All @@ -1495,8 +1504,26 @@ spec:
node in mebibytes.
format: int32
type: integer
maxJobCount:
default: 10000
description: Keep N last jobs in controller memory
format: int32
type: integer
minJobAge:
default: 86400
description: Don't remove jobs from controller memory after some
time
format: int32
type: integer
taskPlugin:
default: task/cgroup,task/affinity
description: Identifies the type of task launch plugin (e.g. pinning
tasks to specific processors)
pattern: ^((task/affinity|task/cgroup|task/none)(,)?)+$
type: string
taskPluginParam:
default: Verbose
description: Additional parameters for the task plugin
pattern: ^((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$
type: string
type: object
Expand Down
27 changes: 27 additions & 0 deletions helm/soperator/crds/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1469,6 +1469,15 @@ spec:
type: string
type: object
slurmConfig:
default:
completeWait: 5
debugFlags: Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs
defCpuPerGPU: 16
defMemPerNode: 1228800
maxJobCount: 10000
minJobAge: 86400
taskPlugin: task/cgroup,task/affinity
taskPluginParam: Verbose
description: SlurmConfig represents the Slurm configuration in slurm.conf.
Not all options are supported.
properties:
Expand All @@ -1495,8 +1504,26 @@ spec:
node in mebibytes.
format: int32
type: integer
maxJobCount:
default: 10000
description: Keep N last jobs in controller memory
format: int32
type: integer
minJobAge:
default: 86400
description: Don't remove jobs from controller memory after some
time
format: int32
type: integer
taskPlugin:
default: task/cgroup,task/affinity
description: Identifies the type of task launch plugin (e.g. pinning
tasks to specific processors)
pattern: ^((task/affinity|task/cgroup|task/none)(,)?)+$
type: string
taskPluginParam:
default: Verbose
description: Additional parameters for the task plugin
pattern: ^((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$
type: string
type: object
Expand Down
2 changes: 2 additions & 0 deletions images/worker/supervisord_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ if [ -n "${CGROUP_V2}" ]; then
if [ -n "${CGROUP_PATH}" ]; then
echo "cgroup v2 detected, creating cgroup for ${CGROUP_PATH}"
mkdir -p /sys/fs/cgroup/${CGROUP_PATH}/../system.slice
# TODO: uncomment this line once 24.11 has been tested. It relates to the OOMKillStep option of taskPluginParam
# echo "1" > /sys/fs/cgroup/${CGROUP_PATH}/../system.slice/memory.oom.group
else
echo "cgroup v2 detected, but cgroup path is empty"
exit 1
Expand Down
4 changes: 0 additions & 4 deletions internal/render/common/configmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,6 @@ func generateSlurmConfig(cluster *values.SlurmCluster) renderutils.ConfigFile {
res.AddProperty("LaunchParameters", "use_interactive_step")
res.AddComment("Scrontab")
res.AddProperty("ScronParameters", "enable,explicit_scancel")
res.AddComment("")
res.AddProperty("MaxJobCount", 1000) // Keep 1000 last jobs in controller memory
res.AddProperty("MinJobAge", 86400) // Don't remove jobs from controller memory after some time
res.AddComment("")
res.AddProperty("PropagateResourceLimits", "NONE") // Don't propagate ulimits from the login node by default
res.AddComment("")
res.AddComment("HEALTH CHECKS")
Expand Down
Loading