Skip to content

Commit

Permalink
Merge pull request #297 from nebius/MSP-3308
Browse files Browse the repository at this point in the history
MSP-3308: delete unused NCCLTypeH100GPUCluster topology
  • Loading branch information
Uburro authored Jan 3, 2025
2 parents f8aeece + 11012c5 commit 156647e
Show file tree
Hide file tree
Showing 8 changed files with 6 additions and 89 deletions.
2 changes: 1 addition & 1 deletion api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ type NCCLSettings struct {

// TopologyType define type of NCCL GPU topology
//
// +kubebuilder:validation:Enum="H100 GPU cluster";auto;custom
// +kubebuilder:validation:Enum=auto;custom
// +kubebuilder:validation:Optional
// +kubebuilder:default="auto"
TopologyType string `json:"topologyType,omitempty"`
Expand Down
1 change: 0 additions & 1 deletion config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1075,7 +1075,6 @@ spec:
default: auto
description: TopologyType define type of NCCL GPU topology
enum:
- H100 GPU cluster
- auto
- custom
type: string
Expand Down
1 change: 0 additions & 1 deletion config/crd/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ spec:
default: auto
description: TopologyType define type of NCCL GPU topology
enum:
- H100 GPU cluster
- auto
- custom
type: string
Expand Down
2 changes: 1 addition & 1 deletion helm/slurm-cluster/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ populateJail:
# volumeSourceName: "jail-snapshot"
overwrite: false
ncclSettings:
# TopologyType define type of NCCL GPU topology Enum: H100 GPU cluster, auto, custom
# TopologyType define type of NCCL GPU topology Enum: auto, custom
topologyType: "auto"
# TopologyData defines NCCL GPU topology
topologyData: ""
Expand Down
1 change: 0 additions & 1 deletion helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1074,7 +1074,6 @@ spec:
default: auto
description: TopologyType define type of NCCL GPU topology
enum:
- H100 GPU cluster
- auto
- custom
type: string
Expand Down
1 change: 0 additions & 1 deletion helm/soperator/crds/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1074,7 +1074,6 @@ spec:
default: auto
description: TopologyType define type of NCCL GPU topology
enum:
- H100 GPU cluster
- auto
- custom
type: string
Expand Down
10 changes: 4 additions & 6 deletions internal/consts/nccl_topology_type.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,13 @@ func (b baseNCCLType) String() string {
}

var (
NCCLTypeAuto NCCLType = baseNCCLType{"auto"}
NCCLTypeH100GPUCluster NCCLType = baseNCCLType{"H100 GPU cluster"}
NCCLTypeCustom NCCLType = baseNCCLType{"custom"}
NCCLTypeAuto NCCLType = baseNCCLType{"auto"}
NCCLTypeCustom NCCLType = baseNCCLType{"custom"}
)

var ncclTypeMap = map[string]NCCLType{
"auto": NCCLTypeAuto,
"H100 GPU cluster": NCCLTypeH100GPUCluster,
"custom": NCCLTypeCustom,
"auto": NCCLTypeAuto,
"custom": NCCLTypeCustom,
}

func StringToNCCLType(s string) (NCCLType, error) {
Expand Down
77 changes: 0 additions & 77 deletions internal/render/worker/configmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ func generateVirtualTopology(ncclType consts.NCCLType, topologyData string) (ren
switch ncclType {
case consts.NCCLTypeAuto:
return res, nil
case consts.NCCLTypeH100GPUCluster:
return generateVirtualH100GPUClusterTopology(), nil
case consts.NCCLTypeCustom:
if topologyData != "" {
return renderutils.NewAsIsConfig(topologyData), nil
Expand All @@ -58,81 +56,6 @@ func generateVirtualTopology(ncclType consts.NCCLType, topologyData string) (ren
}
}

// generateVirtualH100GPUClusterTopology returns the hard-coded topology XML used
// for the "H100 GPU cluster" NCCL topology type.
//
// The document is a single <system version="1"> element with two <cpu> entries
// (numaid 0 and numaid 1). Each <cpu> entry lists four PCI bridge entries, and
// every bridge pairs one NIC (mlx5_4..mlx5_7 under numaid 0, mlx5_0..mlx5_3 under
// numaid 1) with one class 0x030200 device from vendor 0x10de.
//
// The lines are appended to a renderutils.MultilineStringConfig in exactly the
// order listed below; nothing is computed or parameterised.
func generateVirtualH100GPUClusterTopology() renderutils.ConfigFile {
// topologyLines holds the XML document one line per element, in output order.
topologyLines := []string{
"<system version=\"1\">",
// NUMA node 0: NICs mlx5_4 .. mlx5_7.
" <cpu numaid=\"0\" affinity=\"00000000,00000000,0000ffff,ffffffff,ffffffff\" arch=\"x86_64\" vendor=\"GenuineIntel\" familyid=\"6\" modelid=\"106\">",
" <pci busid=\"0000:8a:00.0\" class=\"0x060400\" vendor=\"0x104c\" device=\"0x8232\" subsystem_vendor=\"0x0000\" subsystem_device=\"0x0000\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <pci busid=\"0000:8c:00.0\" class=\"0x020700\" vendor=\"0x15b3\" device=\"0x101e\" subsystem_vendor=\"0x15b3\" subsystem_device=\"0x0023\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <nic>",
" <net name=\"mlx5_4\" dev=\"4\" speed=\"400000\" port=\"1\" latency=\"0.000000\" maxconn=\"131072\" gdr=\"1\" coll=\"1\"/>",
" </nic>",
" </pci>",
" <pci busid=\"0000:8d:00.0\" class=\"0x030200\" vendor=\"0x10de\" device=\"0x2330\" subsystem_vendor=\"0x10de\" subsystem_device=\"0x16c1\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\"/>",
" </pci>",
" <pci busid=\"0000:8e:00.0\" class=\"0x060400\" vendor=\"0x104c\" device=\"0x8232\" subsystem_vendor=\"0x0000\" subsystem_device=\"0x0000\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <pci busid=\"0000:90:00.0\" class=\"0x020700\" vendor=\"0x15b3\" device=\"0x101e\" subsystem_vendor=\"0x15b3\" subsystem_device=\"0x0023\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <nic>",
" <net name=\"mlx5_5\" dev=\"5\" speed=\"400000\" port=\"1\" latency=\"0.000000\" maxconn=\"131072\" gdr=\"1\" coll=\"1\"/>",
" </nic>",
" </pci>",
" <pci busid=\"0000:91:00.0\" class=\"0x030200\" vendor=\"0x10de\" device=\"0x2330\" subsystem_vendor=\"0x10de\" subsystem_device=\"0x16c1\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\"/>",
" </pci>",
" <pci busid=\"0000:92:00.0\" class=\"0x060400\" vendor=\"0x104c\" device=\"0x8232\" subsystem_vendor=\"0x0000\" subsystem_device=\"0x0000\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <pci busid=\"0000:94:00.0\" class=\"0x020700\" vendor=\"0x15b3\" device=\"0x101e\" subsystem_vendor=\"0x15b3\" subsystem_device=\"0x0023\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <nic>",
" <net name=\"mlx5_6\" dev=\"6\" speed=\"400000\" port=\"1\" latency=\"0.000000\" maxconn=\"131072\" gdr=\"1\" coll=\"1\"/>",
" </nic>",
" </pci>",
" <pci busid=\"0000:95:00.0\" class=\"0x030200\" vendor=\"0x10de\" device=\"0x2330\" subsystem_vendor=\"0x10de\" subsystem_device=\"0x16c1\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\"/>",
" </pci>",
" <pci busid=\"0000:96:00.0\" class=\"0x060400\" vendor=\"0x104c\" device=\"0x8232\" subsystem_vendor=\"0x0000\" subsystem_device=\"0x0000\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <pci busid=\"0000:98:00.0\" class=\"0x020700\" vendor=\"0x15b3\" device=\"0x101e\" subsystem_vendor=\"0x15b3\" subsystem_device=\"0x0023\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <nic>",
" <net name=\"mlx5_7\" dev=\"7\" speed=\"400000\" port=\"1\" latency=\"0.000000\" maxconn=\"131072\" gdr=\"1\" coll=\"1\"/>",
" </nic>",
" </pci>",
" <pci busid=\"0000:99:00.0\" class=\"0x030200\" vendor=\"0x10de\" device=\"0x2330\" subsystem_vendor=\"0x10de\" subsystem_device=\"0x16c1\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\"/>",
" </pci>",
" </cpu>",
// NUMA node 1: NICs mlx5_0 .. mlx5_3.
" <cpu numaid=\"1\" affinity=\"ffffffff,ffffffff,ffff0000,00000000,00000000\" arch=\"x86_64\" vendor=\"GenuineIntel\" familyid=\"6\" modelid=\"106\">",
" <pci busid=\"0000:a8:00.0\" class=\"0x060400\" vendor=\"0x104c\" device=\"0x8232\" subsystem_vendor=\"0x0000\" subsystem_device=\"0x0000\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <pci busid=\"0000:aa:00.0\" class=\"0x020700\" vendor=\"0x15b3\" device=\"0x101e\" subsystem_vendor=\"0x15b3\" subsystem_device=\"0x0023\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <nic>",
" <net name=\"mlx5_0\" dev=\"0\" speed=\"400000\" port=\"1\" latency=\"0.000000\" maxconn=\"131072\" gdr=\"1\" coll=\"1\"/>",
" </nic>",
" </pci>",
" <pci busid=\"0000:ab:00.0\" class=\"0x030200\" vendor=\"0x10de\" device=\"0x2330\" subsystem_vendor=\"0x10de\" subsystem_device=\"0x16c1\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\"/>",
" </pci>",
" <pci busid=\"0000:ac:00.0\" class=\"0x060400\" vendor=\"0x104c\" device=\"0x8232\" subsystem_vendor=\"0x0000\" subsystem_device=\"0x0000\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <pci busid=\"0000:ae:00.0\" class=\"0x020700\" vendor=\"0x15b3\" device=\"0x101e\" subsystem_vendor=\"0x15b3\" subsystem_device=\"0x0023\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <nic>",
" <net name=\"mlx5_1\" dev=\"1\" speed=\"400000\" port=\"1\" latency=\"0.000000\" maxconn=\"131072\" gdr=\"1\" coll=\"1\"/>",
" </nic>",
" </pci>",
" <pci busid=\"0000:af:00.0\" class=\"0x030200\" vendor=\"0x10de\" device=\"0x2330\" subsystem_vendor=\"0x10de\" subsystem_device=\"0x16c1\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\"/>",
" </pci>",
" <pci busid=\"0000:b0:00.0\" class=\"0x060400\" vendor=\"0x104c\" device=\"0x8232\" subsystem_vendor=\"0x0000\" subsystem_device=\"0x0000\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <pci busid=\"0000:b2:00.0\" class=\"0x020700\" vendor=\"0x15b3\" device=\"0x101e\" subsystem_vendor=\"0x15b3\" subsystem_device=\"0x0023\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <nic>",
" <net name=\"mlx5_2\" dev=\"2\" speed=\"400000\" port=\"1\" latency=\"0.000000\" maxconn=\"131072\" gdr=\"1\" coll=\"1\"/>",
" </nic>",
" </pci>",
" <pci busid=\"0000:b3:00.0\" class=\"0x030200\" vendor=\"0x10de\" device=\"0x2330\" subsystem_vendor=\"0x10de\" subsystem_device=\"0x16c1\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\"/>",
" </pci>",
" <pci busid=\"0000:b4:00.0\" class=\"0x060400\" vendor=\"0x104c\" device=\"0x8232\" subsystem_vendor=\"0x0000\" subsystem_device=\"0x0000\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <pci busid=\"0000:b6:00.0\" class=\"0x020700\" vendor=\"0x15b3\" device=\"0x101e\" subsystem_vendor=\"0x15b3\" subsystem_device=\"0x0023\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\">",
" <nic>",
" <net name=\"mlx5_3\" dev=\"3\" speed=\"400000\" port=\"1\" latency=\"0.000000\" maxconn=\"131072\" gdr=\"1\" coll=\"1\"/>",
" </nic>",
" </pci>",
" <pci busid=\"0000:b7:00.0\" class=\"0x030200\" vendor=\"0x10de\" device=\"0x2330\" subsystem_vendor=\"0x10de\" subsystem_device=\"0x16c1\" link_speed=\"32.0 GT/s PCIe\" link_width=\"16\"/>",
" </pci>",
" </cpu>",
"</system>",
}

topology := &renderutils.MultilineStringConfig{}
for _, xmlLine := range topologyLines {
topology.AddLine(xmlLine)
}
return topology
}

// endregion NCCL topology

// region Sysctl
Expand Down

0 comments on commit 156647e

Please sign in to comment.