Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

numalib: provide a fallback for topology scanning on linux #19457

Merged
merged 5 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/19457.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
client: Fixed a bug where clients were unable to detect CPU topology under certain conditions
```
41 changes: 1 addition & 40 deletions client/lib/numalib/detect_default.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,6 @@

package numalib

import (
"context"
"time"

"github.com/hashicorp/nomad/client/lib/idset"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
"github.com/shirou/gopsutil/v3/cpu"
)

// PlatformScanners returns the set of SystemScanner for systems without a
// specific implementation.
func PlatformScanners() []SystemScanner {
Expand All @@ -22,40 +13,10 @@ func PlatformScanners() []SystemScanner {
}
}

// Fixed identifiers used by Generic.ScanSystem: every core it registers is
// attributed to the same node and socket, and maxSpeed is handed to
// Topology.insert unchanged.
const (
nodeID = hw.NodeID(0)
socketID = hw.SocketID(0)
maxSpeed = hw.KHz(0)
)

// Generic implements SystemScanner as a fallback for operating systems without
// a specific implementation.
type Generic struct{}

// ScanSystem records a single node in top.NodeIDs, sizes top.Cores from the
// CPU count reported by gopsutil, and inserts one Performance core per counted
// CPU on nodeID/socketID. Any gopsutil error (or an empty info list) ends the
// scan early without reporting the failure to the caller.
//
// NOTE(review): the inline scan below is followed by a call to scanGeneric,
// which repeats the same work. This looks like the removed and added sides of
// a diff shown together — confirm which body is the current one.
func (g *Generic) ScanSystem(top *Topology) {
// hardware may or may not be NUMA, but for now we only
// detect such topology on linux systems
top.NodeIDs = idset.Empty[hw.NodeID]()
top.NodeIDs.Insert(nodeID)

// cores
// bound both gopsutil calls below with a shared 10 second deadline
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()

count, err := cpu.CountsWithContext(ctx, true)
if err != nil {
return
}
top.Cores = make([]Core, count)

infos, err := cpu.InfoWithContext(ctx)
if err != nil || len(infos) == 0 {
return
}

// only the first info entry is consulted; its Mhz value (scaled by 1000 into
// KHz) is applied to every core
for i := 0; i < count; i++ {
info := infos[0]
speed := hw.KHz(hw.MHz(info.Mhz) * 1000)
top.insert(nodeID, socketID, hw.CoreID(i), Performance, maxSpeed, speed)
}
scanGeneric(top)
}
47 changes: 47 additions & 0 deletions client/lib/numalib/detect_generic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package numalib

import (
"context"
"time"

"github.com/hashicorp/nomad/client/lib/idset"
"github.com/hashicorp/nomad/client/lib/numalib/hw"
"github.com/shirou/gopsutil/v3/cpu"
)

// Fixed identifiers used by scanGeneric: every core it registers is placed on
// genericNodeID and genericSocketID, and genericMaxSpeed is handed to
// Topology.insert unchanged.
const (
genericNodeID = hw.NodeID(0)
genericSocketID = hw.SocketID(0)
genericMaxSpeed = hw.KHz(0)
)

// scanGeneric fills top with a flat topology: a single node holding one
// Performance core per CPU counted by gopsutil. It makes no attempt to detect
// a NUMA layout. If gopsutil fails (or returns no CPU info) the function
// returns early, leaving top only partially populated.
func scanGeneric(top *Topology) {
	// every core is attributed to the one generic node
	top.NodeIDs = idset.Empty[hw.NodeID]()
	top.NodeIDs.Insert(genericNodeID)

	// both gopsutil queries share a single 10 second deadline
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	numCPUs, err := cpu.CountsWithContext(ctx, true)
	if err != nil {
		return
	}
	top.Cores = make([]Core, numCPUs)

	stats, err := cpu.InfoWithContext(ctx)
	if err != nil {
		return
	}
	if len(stats) == 0 {
		return
	}

	// the first info entry's clock speed (MHz scaled to KHz) is used for
	// every core, so compute it once up front
	speed := hw.KHz(hw.MHz(stats[0].Mhz) * 1000)
	for id := 0; id < numCPUs; id++ {
		top.insert(genericNodeID, genericSocketID, hw.CoreID(id), Performance, genericMaxSpeed, speed)
	}
}
38 changes: 38 additions & 0 deletions client/lib/numalib/detect_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ func PlatformScanners() []SystemScanner {
new(Smbios),
new(Cgroups1),
new(Cgroups2),
new(Fallback),
}
}

Expand Down Expand Up @@ -209,3 +210,40 @@ func scanIDs(top *Topology, content string) {
}
}
}

// Fallback is a SystemScanner that detects when the NUMA-aware topology
// scanning was unable to construct a valid model of the system, and in that
// case rebuilds the topology with the generic scanner. This is expected to be
// common for Nomad clients running in containers, on misbehaving hypervisors,
// or without root.
type Fallback struct{}

// ScanSystem inspects the topology assembled so far and, if any sanity check
// fails, discards it and rebuilds it with scanGeneric. A topology that passes
// every check is left untouched.
func (s *Fallback) ScanSystem(top *Topology) {
	// checks run in order and stop at the first failure
	invalid := top.NodeIDs.Empty() ||
		len(top.Distances) == 0 ||
		top.NumCores() <= 0 ||
		top.TotalCompute() <= 0 ||
		top.UsableCompute() <= 0 ||
		top.UsableCores().Empty()
	if !invalid {
		return
	}

	// the topology is unusable: wipe what was collected and treat this client
	// like a windows / unsupported OS
	top.NodeIDs, top.Distances, top.Cores = nil, nil, nil

	scanGeneric(top)
}