From aa9c65433483e3ea42371141bb39e2db4d43f777 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Wed, 13 Dec 2023 15:44:50 +0000 Subject: [PATCH 1/5] numalib: provide a fallback for topology scanning on linux --- client/lib/numalib/detect_default.go | 41 +----------------------- client/lib/numalib/detect_generic.go | 47 ++++++++++++++++++++++++++++ client/lib/numalib/detect_linux.go | 44 ++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 40 deletions(-) create mode 100644 client/lib/numalib/detect_generic.go diff --git a/client/lib/numalib/detect_default.go b/client/lib/numalib/detect_default.go index 479d6d85855..3f2f7b400f8 100644 --- a/client/lib/numalib/detect_default.go +++ b/client/lib/numalib/detect_default.go @@ -5,15 +5,6 @@ package numalib -import ( - "context" - "time" - - "github.com/hashicorp/nomad/client/lib/idset" - "github.com/hashicorp/nomad/client/lib/numalib/hw" - "github.com/shirou/gopsutil/v3/cpu" -) - // PlatformScanners returns the set of SystemScanner for systems without a // specific implementation. func PlatformScanners() []SystemScanner { @@ -22,40 +13,10 @@ func PlatformScanners() []SystemScanner { } } -const ( - nodeID = hw.NodeID(0) - socketID = hw.SocketID(0) - maxSpeed = hw.KHz(0) -) - // Generic implements SystemScanner as a fallback for operating systems without // a specific implementation. type Generic struct{} func (g *Generic) ScanSystem(top *Topology) { - // hardware may or may not be NUMA, but for now we only - // detect such topology on linux systems - top.NodeIDs = idset.Empty[hw.NodeID]() - top.NodeIDs.Insert(nodeID) - - // cores - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - count, err := cpu.CountsWithContext(ctx, true) - if err != nil { - return - } - top.Cores = make([]Core, count) - - infos, err := cpu.InfoWithContext(ctx) - if err != nil || len(infos) == 0 { - return - } - - for i := 0; i < count; i++ { - info := infos[0] - speed := hw.KHz(hw.MHz(info.Mhz) * 1000) - top.insert(nodeID, socketID, hw.CoreID(i), Performance, maxSpeed, speed) - } + scanGeneric(top) } diff --git a/client/lib/numalib/detect_generic.go b/client/lib/numalib/detect_generic.go new file mode 100644 index 00000000000..d6653c20e56 --- /dev/null +++ b/client/lib/numalib/detect_generic.go @@ -0,0 +1,47 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package numalib + +import ( + "context" + "time" + + "github.com/hashicorp/nomad/client/lib/idset" + "github.com/hashicorp/nomad/client/lib/numalib/hw" + "github.com/shirou/gopsutil/v3/cpu" +) + +const ( + nodeID = hw.NodeID(0) + socketID = hw.SocketID(0) + maxSpeed = hw.KHz(0) +) + +func scanGeneric(top *Topology) { + // hardware may or may not be NUMA, but for now we only + // detect such topology on linux systems + top.NodeIDs = idset.Empty[hw.NodeID]() + top.NodeIDs.Insert(nodeID) + + // cores + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + count, err := cpu.CountsWithContext(ctx, true) + if err != nil { + return + } + top.Cores = make([]Core, count) + + infos, err := cpu.InfoWithContext(ctx) + if err != nil || len(infos) == 0 { + return + } + + for i := 0; i < count; i++ { + info := infos[0] + speed := hw.KHz(hw.MHz(info.Mhz) * 1000) + top.insert(nodeID, socketID, hw.CoreID(i), Performance, maxSpeed, speed) + } +} diff --git a/client/lib/numalib/detect_linux.go b/client/lib/numalib/detect_linux.go index 7e78427e70c..08bfab6fff1 100644 --- a/client/lib/numalib/detect_linux.go +++ b/client/lib/numalib/detect_linux.go @@ -23,6 +23,7 @@ func PlatformScanners() []SystemScanner { new(Smbios), new(Cgroups1), new(Cgroups2), + new(Fallback), } } @@ -209,3 +210,46 @@ func scanIDs(top *Topology, content string) { } } } + +// Fallback detects if the NUMA aware topology scanning was unable to construct +// a valid model of the system. This will be common on Nomad clients running in +// containers, erroneous hypervisors, or without root. +type Fallback struct{} + +func (s *Fallback) ScanSystem(top *Topology) { + broken := false + + switch { + case top.NodeIDs.Empty(): + broken = true + break + case len(top.Distances) <= 0: + broken = true + break + case top.NumCores() <= 0: + broken = true + break + case top.TotalCompute() <= 0: + broken = true + break + case top.UsableCompute() <= 0: + broken = true + break + case top.UsableCores().Empty(): + broken = true + break + } + + if !broken { + return + } + + // we have a broken topology; reset it and fallback to the generic scanner + // basically treating this client like a windows / unsupported OS + top.NodeIDs = nil + top.Distances = nil + top.Cores = nil + + // invoke the generic scanner + scanGeneric(top) +} From 2a2c2105a8e1b1db2d4a2aa77f46ca9924b28dc6 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Wed, 13 Dec 2023 15:49:08 +0000 Subject: [PATCH 2/5] numalib: better package var names --- client/lib/numalib/detect_generic.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/client/lib/numalib/detect_generic.go b/client/lib/numalib/detect_generic.go index d6653c20e56..1a69c626020 100644 --- a/client/lib/numalib/detect_generic.go +++ b/client/lib/numalib/detect_generic.go @@ -13,16 +13,16 @@ import ( ) const ( - nodeID = hw.NodeID(0) - socketID = hw.SocketID(0) - maxSpeed = hw.KHz(0) + genericNodeID = hw.NodeID(0) + genericSocketID = hw.SocketID(0) + genericMaxSpeed = hw.KHz(0) ) func scanGeneric(top *Topology) { // hardware may or may not be NUMA, but for now we only // detect such topology on linux systems top.NodeIDs = idset.Empty[hw.NodeID]() - top.NodeIDs.Insert(nodeID) + top.NodeIDs.Insert(genericNodeID) // cores ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) @@ -42,6 +42,6 @@ func scanGeneric(top *Topology) { for i := 0; i < count; i++ { info := infos[0] speed := hw.KHz(hw.MHz(info.Mhz) * 1000) - top.insert(nodeID, socketID, hw.CoreID(i), Performance, maxSpeed, speed) + top.insert(genericNodeID, genericSocketID, hw.CoreID(i), Performance, genericMaxSpeed, speed) } } From 92fe71fde0f9b808fd0f9b9594c09aed1310fe1b Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Wed, 13 Dec 2023 15:50:42 +0000 Subject: [PATCH 3/5] cl: add cl --- .changelog/19457.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/19457.txt diff --git a/.changelog/19457.txt b/.changelog/19457.txt new file mode 100644 index 00000000000..91020913992 --- /dev/null +++ b/.changelog/19457.txt @@ -0,0 +1,3 @@ +```release-note:bug +client: enable a fallback for linux clients unable to detect cpu topology +``` From f760c4a9a33d50142de6478e42662e126a3161d3 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Wed, 13 Dec 2023 15:57:52 +0000 Subject: [PATCH 4/5] lint: fix my sloppy code --- client/lib/numalib/detect_linux.go | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/client/lib/numalib/detect_linux.go b/client/lib/numalib/detect_linux.go index 08bfab6fff1..3d33ef46171 100644 --- a/client/lib/numalib/detect_linux.go +++ b/client/lib/numalib/detect_linux.go @@ -222,22 +222,16 @@ func (s *Fallback) ScanSystem(top *Topology) { switch { case top.NodeIDs.Empty(): broken = true - break - case len(top.Distances) <= 0: + case len(top.Distances) == 0: broken = true - break case top.NumCores() <= 0: broken = true - break case top.TotalCompute() <= 0: broken = true - break case top.UsableCompute() <= 0: broken = true - break case top.UsableCores().Empty(): broken = true - break } if !broken { From b427a975f605713bafafcf747555521869294242 Mon Sep 17 00:00:00 2001 From: Seth Hoenig Date: Wed, 13 Dec 2023 16:36:09 +0000 Subject: [PATCH 5/5] cl: fixup wording --- .changelog/19457.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changelog/19457.txt b/.changelog/19457.txt index 91020913992..2eb71787c64 100644 --- a/.changelog/19457.txt +++ b/.changelog/19457.txt @@ -1,3 +1,3 @@ ```release-note:bug -client: enable a fallback for linux clients unable to detect cpu topology +client: Fixed a bug where clients are unable to detect CPU topology in certain conditions ```