Skip to content

Commit

Permalink
add option to normalize switch names in graph (#78)
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Shmulevich <[email protected]>
  • Loading branch information
dmitsh authored Feb 24, 2025
1 parent a78e2e4 commit 592bc19
Show file tree
Hide file tree
Showing 8 changed files with 236 additions and 121 deletions.
2 changes: 1 addition & 1 deletion pkg/providers/aws/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ func (p *baseProvider) GenerateTopologyConfig(ctx context.Context, pageSize *int

klog.Infof("Extracted topology for %d instances", topo.Len())

return topo.ToThreeTierGraph(NAME, instances)
return topo.ToThreeTierGraph(NAME, instances, false)
}

type Provider struct {
Expand Down
2 changes: 1 addition & 1 deletion pkg/providers/gcp/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ func (p *baseProvider) GenerateTopologyConfig(ctx context.Context, pageSize *int
return nil, err
}

return topo.ToThreeTierGraph(NAME, instances)
return topo.ToThreeTierGraph(NAME, instances, false)
}

type Provider struct {
Expand Down
50 changes: 0 additions & 50 deletions pkg/providers/oci/instance_topology.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"context"
"fmt"
"net/http"
"sort"
"time"

"github.com/oracle/oci-go-sdk/v65/core"
Expand All @@ -30,14 +29,6 @@ import (
"github.com/NVIDIA/topograph/pkg/topology"
)

type level int

const (
localBlockLevel level = iota + 1
networkBlockLevel
hpcIslandLevel
)

func GenerateInstanceTopology(ctx context.Context, factory ClientFactory, cis []topology.ComputeInstances) (*topology.ClusterTopology, error) {
topo := topology.NewClusterTopology()

Expand All @@ -47,47 +38,6 @@ func GenerateInstanceTopology(ctx context.Context, factory ClientFactory, cis []
}
}

// sort by network hierarchy
sort.Slice(topo.Instances, func(i, j int) bool {
if topo.Instances[i].DatacenterID != topo.Instances[j].DatacenterID {
return topo.Instances[i].DatacenterID < topo.Instances[j].DatacenterID
}

if topo.Instances[i].SpineID != topo.Instances[j].SpineID {
return topo.Instances[i].SpineID < topo.Instances[j].SpineID
}

if topo.Instances[i].BlockID != topo.Instances[j].BlockID {
return topo.Instances[i].BlockID < topo.Instances[j].BlockID
}

return topo.Instances[i].InstanceID < topo.Instances[j].InstanceID
})

// assign switch names
levelSwitchCount := map[level]int{localBlockLevel: 0, networkBlockLevel: 0, hpcIslandLevel: 0}
switches := make(map[string]struct{})
for i, inst := range topo.Instances {
_, ok := switches[inst.BlockID]
if !ok {
levelSwitchCount[localBlockLevel]++
topo.Instances[i].BlockName = fmt.Sprintf("Switch.%d.%d", localBlockLevel, levelSwitchCount[localBlockLevel])
switches[inst.BlockID] = struct{}{}
}
_, ok = switches[inst.SpineID]
if !ok {
levelSwitchCount[networkBlockLevel]++
topo.Instances[i].SpineName = fmt.Sprintf("Switch.%d.%d", networkBlockLevel, levelSwitchCount[networkBlockLevel])
switches[inst.SpineID] = struct{}{}
}
_, ok = switches[inst.DatacenterID]
if !ok {
levelSwitchCount[hpcIslandLevel]++
topo.Instances[i].DatacenterName = fmt.Sprintf("Switch.%d.%d", hpcIslandLevel, levelSwitchCount[hpcIslandLevel])
switches[inst.SpineID] = struct{}{}
}
}

return topo, nil
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/providers/oci/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ func (p *Provider) GenerateTopologyConfig(ctx context.Context, _ *int, instances
return nil, err
}

return topo.ToThreeTierGraph(NAME, instances)
return topo.ToThreeTierGraph(NAME, instances, true)
}

// Engine support
Expand Down
17 changes: 9 additions & 8 deletions pkg/topology/block.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ import (
"sort"
)

// DomainMap maps domain name to a map of hostnames
type DomainMap map[string]map[string]struct{}
// DomainMap maps domain name to a map of hostname:instance
type DomainMap map[string]map[string]string

func NewDomainMap() DomainMap {
return make(DomainMap)
Expand All @@ -40,8 +40,9 @@ func (m DomainMap) ToBlocks() *Vertex {
sort.Strings(domainNames)

for i, domainName := range domainNames {
nodes := make([]string, 0, len(m[domainName]))
for node := range m[domainName] {
nodeMap := m[domainName]
nodes := make([]string, 0, len(nodeMap))
for node := range nodeMap {
nodes = append(nodes, node)
}
sort.Strings(nodes)
Expand All @@ -55,7 +56,7 @@ func (m DomainMap) ToBlocks() *Vertex {
for _, node := range nodes {
vertex.Vertices[node] = &Vertex{
Name: node,
ID: node,
ID: nodeMap[node],
}
}

Expand All @@ -65,11 +66,11 @@ func (m DomainMap) ToBlocks() *Vertex {
return blockRoot
}

func (m DomainMap) AddHost(domain, host string) {
func (m DomainMap) AddHost(domain, instance, host string) {
d, ok := m[domain]
if !ok {
m[domain] = make(map[string]struct{})
m[domain] = make(map[string]string)
d = m[domain]
}
d[host] = struct{}{}
d[host] = instance
}
20 changes: 10 additions & 10 deletions pkg/topology/block_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,15 @@ func TestToBlocks(t *testing.T) {
},
{
name: "Case 2: one block",
domainMap: DomainMap{"domain1": {"host1": struct{}{}, "host2": struct{}{}}},
domainMap: DomainMap{"domain1": {"host1": "instance1", "host2": "instance2"}},
blocks: &Vertex{
Vertices: map[string]*Vertex{
"domain1": {
Name: "domain1",
ID: "block001",
Vertices: map[string]*Vertex{
"host1": {ID: "host1", Name: "host1"},
"host2": {ID: "host2", Name: "host2"},
"host1": {ID: "instance1", Name: "host1"},
"host2": {ID: "instance2", Name: "host2"},
},
},
},
Expand All @@ -53,24 +53,24 @@ func TestToBlocks(t *testing.T) {
{
name: "Case 3: two blocks",
domainMap: DomainMap{
"domain1": {"host1": struct{}{}, "host2": struct{}{}},
"domain2": {"host3": struct{}{}},
"domain1": {"host1": "instance1", "host2": "instance2"},
"domain2": {"host3": "instance3"},
},
blocks: &Vertex{
Vertices: map[string]*Vertex{
"domain1": {
Name: "domain1",
ID: "block001",
Vertices: map[string]*Vertex{
"host1": {ID: "host1", Name: "host1"},
"host2": {ID: "host2", Name: "host2"},
"host1": {ID: "instance1", Name: "host1"},
"host2": {ID: "instance2", Name: "host2"},
},
},
"domain2": {
Name: "domain2",
ID: "block002",
Vertices: map[string]*Vertex{
"host3": {ID: "host3", Name: "host3"},
"host3": {ID: "instance3", Name: "host3"},
},
},
},
Expand All @@ -82,8 +82,8 @@ func TestToBlocks(t *testing.T) {
t.Run(tc.name, func(t *testing.T) {
domainMap := NewDomainMap()
for domainName, domain := range tc.domainMap {
for hostname := range domain {
domainMap.AddHost(domainName, hostname)
for hostname, instance := range domain {
domainMap.AddHost(domainName, instance, hostname)
}
}
require.Equal(t, tc.blocks, domainMap.ToBlocks())
Expand Down
69 changes: 67 additions & 2 deletions pkg/topology/graph.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,22 @@
package topology

import (
"fmt"
"sort"

"k8s.io/klog/v2"

"github.com/NVIDIA/topograph/pkg/metrics"
)

// band identifies a tier of the three-tier switch hierarchy. Its numeric
// value is embedded in the generated switch names ("switch.<band>.<index>").
type band int

// Tier identifiers, ordered from the tier closest to the instances upward.
const (
	blockBand      band = 1 // used for InstanceTopology.BlockName
	spineBand      band = 2 // used for InstanceTopology.SpineName
	datacenterBand band = 3 // used for InstanceTopology.DatacenterName
)

// ClusterTopology holds the topology records of the instances in a cluster.
type ClusterTopology struct {
// Instances lists one topology record per instance. Normalize sorts this
// slice in place and rewrites the switch names stored in its elements.
Instances []*InstanceTopology
}
Expand Down Expand Up @@ -50,7 +61,7 @@ func (c *ClusterTopology) Len() int {
return len(c.Instances)
}

func (c *ClusterTopology) ToThreeTierGraph(provider string, cis []ComputeInstances) (*Vertex, error) {
func (c *ClusterTopology) ToThreeTierGraph(provider string, cis []ComputeInstances, normalize bool) (*Vertex, error) {
i2n := make(map[string]string)
for _, ci := range cis {
for instance, node := range ci.Instances {
Expand All @@ -62,6 +73,10 @@ func (c *ClusterTopology) ToThreeTierGraph(provider string, cis []ComputeInstanc
nodes := make(map[string]*Vertex)
domainMap := NewDomainMap()

if normalize {
c.Normalize()
}

for _, inst := range c.Instances {
nodeName, ok := i2n[inst.InstanceID]
if !ok {
Expand All @@ -77,7 +92,7 @@ func (c *ClusterTopology) ToThreeTierGraph(provider string, cis []ComputeInstanc
}

if len(inst.AcceleratorID) != 0 {
domainMap.AddHost(inst.AcceleratorID, nodeName)
domainMap.AddHost(inst.AcceleratorID, inst.InstanceID, nodeName)
}

swNames := [3]string{inst.BlockName, inst.SpineName, inst.DatacenterName}
Expand Down Expand Up @@ -136,3 +151,53 @@ func (c *ClusterTopology) ToThreeTierGraph(provider string, cis []ComputeInstanc

return root, nil
}

// Normalize sorts the instances by network hierarchy (datacenter, spine,
// block, then instance ID) and replaces every switch name with a generated
// one of the form "switch.<band>.<index>", where <band> is the tier number
// and <index> counts distinct switch IDs of that tier in sorted order,
// starting at 1.
//
// All instances that share a switch ID within a tier receive the same
// generated name. The receiver's Instances slice is reordered and its
// elements are modified in place.
func (c *ClusterTopology) Normalize() {
	// sort by network hierarchy
	sort.Slice(c.Instances, func(i, j int) bool {
		a, b := c.Instances[i], c.Instances[j]
		switch {
		case a.DatacenterID != b.DatacenterID:
			return a.DatacenterID < b.DatacenterID
		case a.SpineID != b.SpineID:
			return a.SpineID < b.SpineID
		case a.BlockID != b.BlockID:
			return a.BlockID < b.BlockID
		default:
			return a.InstanceID < b.InstanceID
		}
	})

	// normalize switch names
	bandCounts := map[band]int{blockBand: 0, spineBand: 0, datacenterBand: 0}

	// The cache is keyed by tier as well as ID so that an identical ID string
	// appearing in two different tiers cannot pick up the other tier's name.
	type switchKey struct {
		tier band
		id   string
	}
	names := make(map[switchKey]string)

	// normalized returns the generated name for the switch id in the given
	// tier, minting and remembering a new one the first time the id is seen.
	normalized := func(tier band, id string) string {
		key := switchKey{tier: tier, id: id}
		name, ok := names[key]
		if !ok {
			bandCounts[tier]++
			name = fmt.Sprintf("switch.%d.%d", tier, bandCounts[tier])
			names[key] = name
		}
		return name
	}

	for _, inst := range c.Instances {
		inst.BlockName = normalized(blockBand, inst.BlockID)
		inst.SpineName = normalized(spineBand, inst.SpineID)
		inst.DatacenterName = normalized(datacenterBand, inst.DatacenterID)
	}
}
Loading

0 comments on commit 592bc19

Please sign in to comment.