diff --git a/src/control/SConscript b/src/control/SConscript index 485d2700d18..54ce6dd8b9d 100644 --- a/src/control/SConscript +++ b/src/control/SConscript @@ -15,7 +15,7 @@ def is_firmware_mgmt_build(benv): def get_build_tags(benv): "Get custom go build tags." - tags = ["ucx", "spdk"] + tags = ["spdk"] if is_firmware_mgmt_build(benv): tags.append("firmware") if not is_release_build(benv): @@ -124,7 +124,6 @@ def scons(): denv.Tool('go_builder') - denv.require('ofi', 'ucx') # Sets CGO_LDFLAGS for rpath options denv.d_add_rpaths("..", True, True) denv.AppendENVPath("CGO_CFLAGS", denv.subst("$_CPPINCFLAGS"), sep=" ") diff --git a/src/control/cmd/daos_agent/start.go b/src/control/cmd/daos_agent/start.go index a16c9c68010..40b2e01610c 100644 --- a/src/control/cmd/daos_agent/start.go +++ b/src/control/cmd/daos_agent/start.go @@ -73,14 +73,6 @@ func (cmd *startCmd) Execute(_ []string) error { } cmd.Debugf("created dRPC server: %s", time.Since(createDrpcStart)) - hwprovInitStart := time.Now() - hwprovFini, err := hwprov.Init(cmd.Logger) - if err != nil { - return err - } - defer hwprovFini() - cmd.Debugf("initialized hardware providers: %s", time.Since(hwprovInitStart)) - cacheStart := time.Now() cache := NewInfoCache(ctx, cmd.Logger, cmd.ctlInvoker, cmd.cfg) if cmd.attachInfoCacheDisabled() { diff --git a/src/control/lib/daos/status.go b/src/control/lib/daos/status.go index 96f7cf1b313..e40f71555c9 100644 --- a/src/control/lib/daos/status.go +++ b/src/control/lib/daos/status.go @@ -105,6 +105,8 @@ const ( BadTarget Status = -C.DER_BAD_TARGET // GroupVersionMismatch indicates that group versions didn't match GroupVersionMismatch Status = -C.DER_GRPVER + // MercuryFatalError indicates a fatal (non-retryable) Mercury error + MercuryFatalError Status = -C.DER_HG_FATAL // NoService indicates the pool service is not up and didn't process the pool request NoService Status = -C.DER_NO_SERVICE ) diff --git a/src/control/lib/hardware/cart/bindings.go b/src/control/lib/hardware/cart/bindings.go new file mode 100644 index 00000000000..d52c15842d1 --- /dev/null +++ b/src/control/lib/hardware/cart/bindings.go @@ -0,0 +1,58 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package cart + +/* +#cgo LDFLAGS: -lcart + +#include +#include +*/ +import "C" + +import ( + "unsafe" + + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/logging" +) + +func getProtocolInfo(log logging.Logger, provider string) ([]*crtFabricDevice, error) { + var cInfo *C.struct_crt_protocol_info + var cProtoStr *C.char + if provider != "" { + log.Debugf("getting fabric protocol info from CART for %q", provider) + cProtoStr = C.CString(provider) + defer C.free(unsafe.Pointer(cProtoStr)) + } else { + log.Debug("getting all fabric protocol info from CART") + } + + if err := daos.Status(C.crt_protocol_info_get(cProtoStr, &cInfo)); err != daos.Success { + return nil, errors.Wrap(err, "crt_hg_get_protocol_info") + } + defer C.crt_protocol_info_free(cInfo) + + infoList := make([]*crtFabricDevice, 0) + + for cur := cInfo; cur != nil; cur = cur.next { + infoList = append(infoList, cToCrtProtocolInfo(cur)) + } + + log.Debugf("CART protocol info discovered:\n%+v", infoList) + return infoList, nil +} + +func cToCrtProtocolInfo(cInfo *C.struct_crt_protocol_info) *crtFabricDevice { + return &crtFabricDevice{ + Class: C.GoString(cInfo.class_name), + Protocol: C.GoString(cInfo.protocol_name), + Device: C.GoString(cInfo.device_name), + } +} diff --git a/src/control/lib/hardware/cart/cart.go b/src/control/lib/hardware/cart/cart.go new file mode 100644 index 00000000000..f174fc06aff --- /dev/null +++ b/src/control/lib/hardware/cart/cart.go @@ -0,0 +1,158 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package cart + +import ( + "context" + "fmt" + "strings" + + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/logging" +) + +const ( + classLibFabric = "ofi" + classUCX = "ucx" + classNA = "na" +) + +// crtFabricDevice is a single fabric device discovered by CART. +type crtFabricDevice struct { + Class string `json:"class"` + Protocol string `json:"protocol"` + Device string `json:"device"` +} + +// isUCX indicates whether this is a UCX device. +func (cfd *crtFabricDevice) IsUCX() bool { + return cfd.Class == classUCX +} + +// OSName returns the OS level network device name for this device. +func (cfd *crtFabricDevice) OSName() string { + if cfd.IsUCX() { + return getOSNameFromUCXDevice(cfd.Device) + } + return cfd.Device +} + +// ProviderName returns the DAOS fabric provider name for this device's protocol. +func (cfd *crtFabricDevice) ProviderName() string { + return fmt.Sprintf("%s+%s", cfd.Class, cfd.Protocol) +} + +type getProtocolFn func(log logging.Logger, provider string) ([]*crtFabricDevice, error) + +// Provider provides access to the CART API. +type Provider struct { + log logging.Logger + getProtocolInfo getProtocolFn +} + +// NewProvider creates a new CART Provider. +func NewProvider(log logging.Logger) *Provider { + return &Provider{ + log: log, + } +} + +// GetFabricInterfaces fetches information about the system fabric interfaces via CART. +func (p *Provider) GetFabricInterfaces(ctx context.Context, provider string) (*hardware.FabricInterfaceSet, error) { + if p == nil { + return nil, errors.New("nil CART Provider") + } + + ch := make(chan *fabricResult) + go p.getFabricInterfaces(provider, ch) + select { + case <-ctx.Done(): + return nil, ctx.Err() + case result := <-ch: + return result.fiSet, result.err + } +} + +type fabricResult struct { + fiSet *hardware.FabricInterfaceSet + err error +} + +type providerPriorities map[string]int + +func (p providerPriorities) getPriority(provName string) int { + prio, ok := p[provName] + if !ok { + prio = len(p) + p[provName] = prio + } + return prio +} + +func (p *Provider) getFabricInterfaces(provider string, ch chan *fabricResult) { + if p.getProtocolInfo == nil { + p.getProtocolInfo = getProtocolInfo + } + + devices, err := p.getProtocolInfo(p.log, provider) + if err != nil { + // TODO DAOS-15588: Remove this special handling for verbs once the + // underlying Mercury bug is fixed. + // Currently requesting verbs on a system without Infiniband results in + // a Mercury error. + if errors.Is(err, daos.MercuryFatalError) && strings.HasSuffix(provider, "verbs") { + ch <- &fabricResult{ + fiSet: hardware.NewFabricInterfaceSet(), + } + return + } + + provMsg := "" + if provider != "" { + provMsg = fmt.Sprintf(" for provider %q", provider) + } + ch <- &fabricResult{ + err: errors.Wrapf(err, "fetching fabric interfaces%s", provMsg), + } + return + } + + fis := hardware.NewFabricInterfaceSet() + priorities := make(providerPriorities) + for _, dev := range devices { + fis.Update(crtFabricDeviceToFabricInterface(dev, priorities)) + } + + ch <- &fabricResult{ + fiSet: fis, + } +} + +func crtFabricDeviceToFabricInterface(dev *crtFabricDevice, priorities providerPriorities) *hardware.FabricInterface { + return &hardware.FabricInterface{ + Name: dev.Device, + OSName: dev.OSName(), + Providers: getProviderSet(dev, priorities), + } +} + +// getProviderSet returns a set of one or more DAOS providers associated with the protocol info. +func getProviderSet(dev *crtFabricDevice, priorities providerPriorities) *hardware.FabricProviderSet { + if dev.IsUCX() { + // UCX determines its own priorities within the provider set + return getProviderSetFromUCXTransport(dev.Protocol) + } + + name := dev.ProviderName() + return hardware.NewFabricProviderSet(&hardware.FabricProvider{ + Name: name, + Priority: priorities.getPriority(name), + }) +} diff --git a/src/control/lib/hardware/cart/cart_test.go b/src/control/lib/hardware/cart/cart_test.go new file mode 100644 index 00000000000..a2574bde985 --- /dev/null +++ b/src/control/lib/hardware/cart/cart_test.go @@ -0,0 +1,278 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package cart + +import ( + "context" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/logging" +) + +func TestCart_Provider_GetFabricInterfaces_Integrated(t *testing.T) { + for name, tc := range map[string]struct { + in string + expErr error + }{ + "all": {}, + "all tcp": { + in: "tcp", + }, + "ofi+tcp": { + in: "ofi+tcp", + }, + "ucx+tcp": { + in: "ucx+tcp", + }, + "all verbs": { + in: "verbs", + }, + "ofi+verbs": { + in: "ofi+verbs", + }, + "ucx+rc": { + in: "ucx+rc_v", + }, + "garbage": { + in: "blahblahblah", + expErr: daos.MercuryError, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + p := NewProvider(log) + + result, err := p.GetFabricInterfaces(test.Context(t), tc.in) + + test.CmpErr(t, tc.expErr, err) + + if err == nil { + t.Logf("Results:\n%+v", result) + } + }) + } +} + +func TestCart_Provider_GetFabricInterfaces(t *testing.T) { + for name, tc := range map[string]struct { + nilProvider bool + getContext func(*testing.T) context.Context + getProtocolInfo getProtocolFn + wantProv string + expResult *hardware.FabricInterfaceSet + expErr error + }{ + "nil": { + nilProvider: true, + expErr: errors.New("nil"), + }, + "context canceled": { + getContext: func(t *testing.T) context.Context { + ctx, cancel := context.WithCancel(test.Context(t)) + cancel() + return ctx + }, + getProtocolInfo: func(log logging.Logger, provider string) ([]*crtFabricDevice, error) { + time.Sleep(1 * time.Second) // a delay to ensure canceled context is noticed first + return nil, errors.New("should not get here") + }, + expErr: errors.New("context canceled"), + }, + "getProtocolInfo fails": { + getProtocolInfo: func(_ logging.Logger, _ string) ([]*crtFabricDevice, error) { + return nil, errors.New("mock failure") + }, + expErr: errors.New("fetching fabric interfaces: mock failure"), + }, + "getProtocolInfo fails with specific provider": { + getProtocolInfo: func(_ logging.Logger, provider string) ([]*crtFabricDevice, error) { + if provider != "ofi+verbs" { + return nil, errors.Errorf("FAIL: wrong provider %q passed in", provider) + } + return nil, errors.New("mock failure") + }, + wantProv: "ofi+verbs", + expErr: errors.New("fetching fabric interfaces for provider \"ofi+verbs\": mock failure"), + }, + "no fabric interfaces": { + getProtocolInfo: func(_ logging.Logger, _ string) ([]*crtFabricDevice, error) { + return []*crtFabricDevice{}, nil + }, + expResult: hardware.NewFabricInterfaceSet(), + }, + "success": { + getProtocolInfo: func(_ logging.Logger, _ string) ([]*crtFabricDevice, error) { + return []*crtFabricDevice{ + { + Class: classLibFabric, + Protocol: "verbs", + Device: "test0", + }, + { + Class: classUCX, + Protocol: "rc_verbs", + Device: "test0:1", + }, + { + Class: classUCX, + Protocol: "ud_verbs", + Device: "test0:1", + }, + { + Class: classLibFabric, + Protocol: "tcp", + Device: "test1", + }, + { + Class: classUCX, + Protocol: "tcp", + Device: "test1", + }, + { + Class: classNA, + Protocol: "shm", + Device: "shm", + }, + }, nil + }, + expResult: hardware.NewFabricInterfaceSet( + &hardware.FabricInterface{ + Name: "test0", + OSName: "test0", + Providers: hardware.NewFabricProviderSet( + &hardware.FabricProvider{ + Name: "ofi+verbs", + }, + ), + }, + &hardware.FabricInterface{ + Name: "test0:1", + OSName: "test0", + Providers: hardware.NewFabricProviderSet( + &hardware.FabricProvider{ + Name: "ucx+rc_v", + }, + &hardware.FabricProvider{ + Name: "ucx+ud_v", + }, + &hardware.FabricProvider{ + Name: "ucx+rc", + }, + &hardware.FabricProvider{ + Name: "ucx+ud", + }, + &hardware.FabricProvider{ + Name: "ucx+all", + Priority: ucxCatchallPriority, + }, + ), + }, + &hardware.FabricInterface{ + Name: "test1", + OSName: "test1", + Providers: hardware.NewFabricProviderSet( + &hardware.FabricProvider{ + Name: "ofi+tcp", + Priority: 1, + }, + &hardware.FabricProvider{ + Name: "ucx+tcp", + Priority: ucxTCPPriority, + }, + &hardware.FabricProvider{ + Name: "ucx+all", + Priority: ucxCatchallPriority, + }, + ), + }, + &hardware.FabricInterface{ + Name: "shm", + OSName: "shm", + Providers: hardware.NewFabricProviderSet( + &hardware.FabricProvider{ + Name: "na+shm", + Priority: 2, + }, + ), + }, + ), + }, + "success with provider": { + getProtocolInfo: func(_ logging.Logger, provider string) ([]*crtFabricDevice, error) { + if provider != "ofi+tcp" { + return nil, errors.Errorf("FAIL: wrong provider %q passed in", provider) + } + return []*crtFabricDevice{ + { + Class: classLibFabric, + Protocol: "tcp", + Device: "test0", + }, + { + Class: classLibFabric, + Protocol: "tcp", + Device: "test1", + }, + }, nil + }, + wantProv: "ofi+tcp", + expResult: hardware.NewFabricInterfaceSet( + &hardware.FabricInterface{ + Name: "test0", + OSName: "test0", + Providers: hardware.NewFabricProviderSet( + &hardware.FabricProvider{ + Name: "ofi+tcp", + }, + ), + }, + &hardware.FabricInterface{ + Name: "test1", + OSName: "test1", + Providers: hardware.NewFabricProviderSet( + &hardware.FabricProvider{ + Name: "ofi+tcp", + }, + ), + }, + ), + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + var p *Provider + if !tc.nilProvider { + p = NewProvider(log) + p.getProtocolInfo = tc.getProtocolInfo + } + + if tc.getContext == nil { + tc.getContext = test.Context + } + result, err := p.GetFabricInterfaces(tc.getContext(t), tc.wantProv) + + test.CmpErr(t, tc.expErr, err) + if diff := cmp.Diff(tc.expResult, result, cmp.AllowUnexported( + hardware.FabricInterfaceSet{}, + hardware.FabricProviderSet{}, + )); diff != "" { + t.Fatalf("want-, got+:\n%s", diff) + } + }) + } +} diff --git a/src/control/lib/hardware/cart/ucx.go b/src/control/lib/hardware/cart/ucx.go new file mode 100644 index 00000000000..c9881e88574 --- /dev/null +++ b/src/control/lib/hardware/cart/ucx.go @@ -0,0 +1,87 @@ +// +// (C) Copyright 2022-2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package cart + +import ( + "strings" + + "github.com/daos-stack/daos/src/control/lib/hardware" +) + +const ( + ucxTCPPriority = 25 + ucxCatchallPriority = 99 +) + +// getOSNameFromUCXDevice gets the OS-level device name based on the UCX device name. +func getOSNameFromUCXDevice(ucxDevice string) string { + // the device name is in a format like "mlx5_0:1" + return strings.Split(ucxDevice, ":")[0] +} + +// getProviderSetFromUCXTransport gets the set of DAOS providers associated with a UCX transport. +func getProviderSetFromUCXTransport(transport string) *hardware.FabricProviderSet { + if transport == "" { + return hardware.NewFabricProviderSet() + } + genericTransport := strings.Split(transport, "_")[0] + + priority := 0 // by default use the highest + daosProv := ucxTransportToDAOSProvider(transport) + if daosProv == "ucx+tcp" { + priority = ucxTCPPriority // TCP is less desirable than other options if this is Infiniband + } + providers := hardware.NewFabricProviderSet( + &hardware.FabricProvider{ + Name: daosProv, + Priority: priority, + }, + ) + if shouldAddGeneric(transport) { + providers.Add(&hardware.FabricProvider{ + Name: ucxTransportToDAOSProvider(genericTransport), + Priority: priority, + }) + } + // Any interface with at least one provider should allow ucx+all + providers.Add(&hardware.FabricProvider{ + Name: "ucx+all", + Priority: ucxCatchallPriority, + }) + return providers +} + +func shouldAddGeneric(transport string) bool { + genericTransportAliases := []string{"rc", "ud", "dc"} + for _, alias := range genericTransportAliases { + if strings.HasPrefix(transport, alias+"_") { + return true + } + } + return false +} + +// ucxTransportToDAOSProvider translates the UCX transport type to a DAOS fabric provider string. +func ucxTransportToDAOSProvider(transport string) string { + prefix := "ucx+" + transportPieces := strings.Split(transport, "_") + if len(transportPieces) < 2 { + return prefix + transport + } + + // Transport strings from the library need to be translated to the supported aliases. + // UCX transport aliases: + // https://openucx.readthedocs.io/en/master/faq.html#list-of-main-transports-and-aliases + switch { + case transportPieces[1] == "verbs": + transportPieces[1] = "v" + case strings.HasPrefix(transportPieces[1], "mlx"): + // accelerated Mellanox transport + transportPieces[1] = "x" + } + return prefix + strings.Join(transportPieces, "_") +} diff --git a/src/control/lib/hardware/ucx/ucx_test.go b/src/control/lib/hardware/cart/ucx_test.go similarity index 63% rename from src/control/lib/hardware/ucx/ucx_test.go rename to src/control/lib/hardware/cart/ucx_test.go index c8928817961..b9d2e954120 100644 --- a/src/control/lib/hardware/ucx/ucx_test.go +++ b/src/control/lib/hardware/cart/ucx_test.go @@ -1,10 +1,10 @@ // -// (C) Copyright 2022 Intel Corporation. +// (C) Copyright 2022-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // -package ucx +package cart import ( "testing" @@ -13,14 +13,49 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/lib/hardware" - "github.com/daos-stack/daos/src/control/logging" ) -func TestUCX_Provider_getProviderSet(t *testing.T) { +func TestCart_getOSNameFromUCXDevice(t *testing.T) { + for name, tc := range map[string]struct { + in string + expResult string + }{ + "empty": {}, + "no port": { + in: "dev0_1", + expResult: "dev0_1", + }, + "port": { + in: "dev0_1:1", + expResult: "dev0_1", + }, + } { + t.Run(name, func(t *testing.T) { + test.AssertEqual(t, tc.expResult, getOSNameFromUCXDevice(tc.in), "") + }) + } +} + +func TestCart_getProviderSetFromUCXTransport(t *testing.T) { for name, tc := range map[string]struct { in string expSet *hardware.FabricProviderSet }{ + "empty": { + expSet: hardware.NewFabricProviderSet(), + }, + "custom": { + in: "custom", + expSet: hardware.NewFabricProviderSet( + &hardware.FabricProvider{ + Name: "ucx+custom", + }, + &hardware.FabricProvider{ + Name: "ucx+all", + Priority: ucxCatchallPriority, + }, + ), + }, "dc": { in: "dc_mlx5", expSet: hardware.NewFabricProviderSet( @@ -32,7 +67,7 @@ func TestUCX_Provider_getProviderSet(t *testing.T) { }, &hardware.FabricProvider{ Name: "ucx+all", - Priority: catchallPriority, + Priority: ucxCatchallPriority, }, ), }, @@ -41,11 +76,11 @@ func TestUCX_Provider_getProviderSet(t *testing.T) { expSet: hardware.NewFabricProviderSet( &hardware.FabricProvider{ Name: "ucx+tcp", - Priority: tcpPriority, + Priority: ucxTCPPriority, }, &hardware.FabricProvider{ Name: "ucx+all", - Priority: catchallPriority, + Priority: ucxCatchallPriority, }, ), }, @@ -60,7 +95,7 @@ func TestUCX_Provider_getProviderSet(t *testing.T) { }, &hardware.FabricProvider{ Name: "ucx+all", - Priority: catchallPriority, + Priority: ucxCatchallPriority, }, ), }, @@ -75,18 +110,13 @@ func TestUCX_Provider_getProviderSet(t *testing.T) { }, &hardware.FabricProvider{ Name: "ucx+all", - Priority: catchallPriority, + Priority: ucxCatchallPriority, }, ), }, } { t.Run(name, func(t *testing.T) { - log, buf := logging.NewTestLogger(t.Name()) - defer test.ShowBufferOnFailure(t, buf) - - p := NewProvider(log) - - set := p.getProviderSet(tc.in) + set := getProviderSetFromUCXTransport(tc.in) if diff := cmp.Diff(tc.expSet, set, cmp.AllowUnexported(hardware.FabricProviderSet{})); diff != "" { t.Fatalf("(-want, +got)\n%s\n", diff) @@ -95,11 +125,15 @@ func TestUCX_Provider_getProviderSet(t *testing.T) { } } -func TestUCX_transportToDAOSProvider(t *testing.T) { +func TestCart_ucxTransportToDAOSProvider(t *testing.T) { for name, tc := range map[string]struct { in string exp string }{ + "custom": { + in: "custom", + exp: "ucx+custom", + }, "rc_verbs": { in: "rc_verbs", exp: "ucx+rc_v", @@ -138,7 +172,7 @@ func TestUCX_transportToDAOSProvider(t *testing.T) { }, } { t.Run(name, func(t *testing.T) { - test.AssertEqual(t, tc.exp, transportToDAOSProvider(tc.in), "") + test.AssertEqual(t, tc.exp, ucxTransportToDAOSProvider(tc.in), "") }) } } diff --git a/src/control/lib/hardware/hwprov/defaults.go b/src/control/lib/hardware/hwprov/defaults.go index 447e41bffa9..acb8e557463 100644 --- a/src/control/lib/hardware/hwprov/defaults.go +++ b/src/control/lib/hardware/hwprov/defaults.go @@ -7,13 +7,10 @@ package hwprov import ( - "github.com/pkg/errors" - "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/hardware/cart" "github.com/daos-stack/daos/src/control/lib/hardware/hwloc" - "github.com/daos-stack/daos/src/control/lib/hardware/libfabric" "github.com/daos-stack/daos/src/control/lib/hardware/sysfs" - "github.com/daos-stack/daos/src/control/lib/hardware/ucx" "github.com/daos-stack/daos/src/control/logging" ) @@ -39,9 +36,8 @@ func DefaultProcessNUMAProvider(log logging.Logger) hardware.ProcessNUMAProvider // DefaultFabricInterfaceProviders returns the default fabric interface providers. func DefaultFabricInterfaceProviders(log logging.Logger) []hardware.FabricInterfaceProvider { return []hardware.FabricInterfaceProvider{ - libfabric.NewProvider(log), + cart.NewProvider(log), sysfs.NewProvider(log), - ucx.NewProvider(log), } } @@ -74,37 +70,6 @@ func DefaultNetDevStateProvider(log logging.Logger) hardware.NetDevStateProvider return sysfs.NewProvider(log) } -// Init loads up any dynamic libraries that need to be loaded at runtime. -func Init(log logging.Logger) (func(), error) { - initFns := []func() (func(), error){ - libfabric.Load, - ucx.Load, - } - - cleanupFns := make([]func(), 0) - numLoaded := 0 - - for _, loadLib := range initFns { - if cleanupLib, err := loadLib(); err == nil { - numLoaded++ - cleanupFns = append(cleanupFns, cleanupLib) - } else { - log.Debug(err.Error()) - } - } - - if numLoaded == 0 { - return nil, errors.New("unable to load any supported fabric libraries") - } - - return func() { - // Unload libraries in reverse order - for i := len(cleanupFns) - 1; i >= 0; i-- { - cleanupFns[i]() - } - }, nil -} - // DefaultIOMMUDetector gets the default provider for the IOMMU detector. func DefaultIOMMUDetector(log logging.Logger) hardware.IOMMUDetector { return sysfs.NewProvider(log) diff --git a/src/control/lib/hardware/hwprov/defaults_test.go b/src/control/lib/hardware/hwprov/defaults_test.go index 57cc720e5c3..17f0040ad74 100644 --- a/src/control/lib/hardware/hwprov/defaults_test.go +++ b/src/control/lib/hardware/hwprov/defaults_test.go @@ -14,10 +14,9 @@ import ( "github.com/daos-stack/daos/src/control/common/test" "github.com/daos-stack/daos/src/control/lib/hardware" + "github.com/daos-stack/daos/src/control/lib/hardware/cart" "github.com/daos-stack/daos/src/control/lib/hardware/hwloc" - "github.com/daos-stack/daos/src/control/lib/hardware/libfabric" "github.com/daos-stack/daos/src/control/lib/hardware/sysfs" - "github.com/daos-stack/daos/src/control/lib/hardware/ucx" "github.com/daos-stack/daos/src/control/logging" ) @@ -67,17 +66,15 @@ func TestHwprov_DefaultFabricInterfaceProviders(t *testing.T) { defer test.ShowBufferOnFailure(t, buf) expResult := []hardware.FabricInterfaceProvider{ - libfabric.NewProvider(log), + cart.NewProvider(log), sysfs.NewProvider(log), - ucx.NewProvider(log), } result := DefaultFabricInterfaceProviders(log) if diff := cmp.Diff(expResult, result, - cmpopts.IgnoreUnexported(libfabric.Provider{}), + cmpopts.IgnoreUnexported(cart.Provider{}), cmpopts.IgnoreUnexported(sysfs.Provider{}), - cmpopts.IgnoreUnexported(ucx.Provider{}), ); diff != "" { t.Fatalf("(-want, +got)\n%s\n", diff) } @@ -115,7 +112,7 @@ func TestHwprov_DefaultFabricScannerConfig(t *testing.T) { cmpopts.IgnoreUnexported( hardware.TopologyFactory{}, hwloc.Provider{}, - libfabric.Provider{}, + cart.Provider{}, sysfs.Provider{}, ), test.CmpOptIgnoreFieldAnyType("log"), @@ -146,7 +143,7 @@ func TestHwprov_DefaultFabricScanner(t *testing.T) { ), cmpopts.IgnoreUnexported( hwloc.Provider{}, - libfabric.Provider{}, + cart.Provider{}, sysfs.Provider{}, ), test.CmpOptIgnoreFieldAnyType("log"), diff --git a/src/control/lib/hardware/libfabric/bindings.go b/src/control/lib/hardware/libfabric/bindings.go deleted file mode 100644 index 65c6a3ac5a2..00000000000 --- a/src/control/lib/hardware/libfabric/bindings.go +++ /dev/null @@ -1,239 +0,0 @@ -// -// (C) Copyright 2021-2022 Intel Corporation. -// -// SPDX-License-Identifier: BSD-2-Clause-Patent -// - -package libfabric - -/* -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define getHFIUnitError -2 -typedef struct { - uint64_t reserved_1; - uint8_t reserved_2; - int8_t unit; - uint8_t port; - uint8_t reserved_3; - uint32_t service; -} psmx2_ep_name; - -int get_hfi_unit(void *src_addr) { - psmx2_ep_name *psmx2; - psmx2 = (psmx2_ep_name *)src_addr; - if (!psmx2) - return getHFIUnitError; - return psmx2->unit; -} - -// Major and minor versions are hard-coded per libfabric recommendations -uint lib_fabric_version(void) -{ - return FI_VERSION(1, 7); -} - -// call into C functions that are looked up at runtime - -int call_fi_getinfo(void *fn, struct fi_info *hint, struct fi_info **info) -{ - int (*getinfo)(uint, char*, char*, ulong, struct fi_info*, struct fi_info**); - - assert(fn != NULL); - getinfo = (int (*)(uint, char*, char*, ulong, struct fi_info*, struct fi_info**))fn; - return getinfo(lib_fabric_version(), NULL, NULL, 0, hint, info); -} - -struct fi_info *call_fi_dupinfo(void *fn, struct fi_info *info) -{ - struct fi_info *(*dupinfo)(struct fi_info*); - - assert(fn != NULL); - dupinfo = (struct fi_info *(*)(struct fi_info*))fn; - return dupinfo(info); -} - -void call_fi_freeinfo(void *fn, struct fi_info *info) -{ - void (*freeinfo)(struct fi_info*); - - assert(fn != NULL); - freeinfo = (void (*)(struct fi_info*))fn; - freeinfo(info); -} - -char *call_fi_strerror(void *fn, int code) -{ - char *(*strerr)(int); - - assert(fn != NULL); - strerr = (char *(*)(int))fn; - return strerr(code); -} -*/ -import "C" - -import ( - "fmt" - "unsafe" - - "github.com/pkg/errors" - - "github.com/daos-stack/daos/src/control/lib/dlopen" - "github.com/daos-stack/daos/src/control/logging" -) - -// Load dynamically loads the libfabric library and provides a method to unload it. -func Load() (func(), error) { - hdl, err := openLib() - if err != nil { - return nil, errors.Wrap(err, "loading libfabric") - } - return func() { - hdl.Close() - }, nil -} - -func openLib() (*dlopen.LibHandle, error) { - return dlopen.GetHandle([]string{"libfabric.so.1", "libfabric.so"}) -} - -func libFabricVersion() C.uint { - return C.lib_fabric_version() -} - -var errHFIUnitsInUse = errors.New("all HFI units in use") - -type fiInfo struct { - cFI *C.struct_fi_info -} - -func (f *fiInfo) domainName() string { - if f.cFI == nil || f.cFI.domain_attr == nil || f.cFI.domain_attr.name == nil { - return "" - } - return C.GoString(f.cFI.domain_attr.name) -} - -func (f *fiInfo) fabricProvider() string { - if f.cFI == nil || f.cFI.fabric_attr == nil || f.cFI.fabric_attr.prov_name == nil { - return "" - } - return C.GoString(f.cFI.fabric_attr.prov_name) -} - -func (f *fiInfo) hfiUnit() (uint, error) { - hfiUnit := C.get_hfi_unit(f.cFI.src_addr) - switch hfiUnit { - case C.getHFIUnitError: - return 0, errors.New("failed to get HFI unit") - case -1: - return 0, errHFIUnitsInUse - } - return uint(hfiUnit), nil -} - -// fiGetInfo fetches the list of fi_info structs with the desired provider (if non-empty), or all of -// them otherwise. It also returns the cleanup function to free the fi_info. -func fiGetInfo(log logging.Logger, hdl *dlopen.LibHandle, prov string) ([]*fiInfo, func() error, error) { - getInfoPtr, err := getLibFuncPtr(hdl, "fi_getinfo") - if err != nil { - return nil, nil, err - } - - var hint *C.struct_fi_info - if len(prov) != 0 { - var cleanupHint func() error - hint, cleanupHint, err = fiAllocInfo(hdl) - if err != nil { - return nil, nil, errors.Wrap(err, "allocating fi_info hint") - } - defer func() { - if cleanupErr := cleanupHint(); cleanupErr != nil && err != nil { - log.Errorf("failed to clean up fi_info hint: %s", err.Error()) - } - }() - - hint.fabric_attr.prov_name = C.CString(prov) - } - - var fi *C.struct_fi_info - fiList := make([]*fiInfo, 0) - result := C.call_fi_getinfo(getInfoPtr, hint, &fi) - switch { - case result == -C.FI_ENODATA: // unable to get anything for the requested provider - return fiList, func() error { return nil }, nil - case result < 0: - return nil, nil, errors.Errorf("fi_getinfo() failed: %s", fiStrError(hdl, -result)) - case fi == nil: - return nil, nil, errors.Errorf("fi_getinfo() returned no results") - } - - for ; fi != nil; fi = fi.next { - fiList = append(fiList, &fiInfo{ - cFI: fi, - }) - } - - return fiList, func() error { - return fiFreeInfo(hdl, fi) - }, nil -} - -func fiAllocInfo(hdl *dlopen.LibHandle) (*C.struct_fi_info, func() error, error) { - dupPtr, err := getLibFuncPtr(hdl, "fi_dupinfo") - if err != nil { - return nil, nil, err - } - - result := C.call_fi_dupinfo(dupPtr, nil) - if result == nil { - return nil, nil, errors.New("fi_dupinfo() failed") - } - - return result, func() error { - return fiFreeInfo(hdl, result) - }, nil -} - -func fiStrError(hdl *dlopen.LibHandle, result C.int) string { - ptr, err := getLibFuncPtr(hdl, "fi_strerror") - if err != nil { - return fmt.Sprintf("%d (%s)", result, err.Error()) - } - - cStr := C.call_fi_strerror(ptr, -result) - return C.GoString(cStr) -} - -func fiFreeInfo(hdl *dlopen.LibHandle, info *C.struct_fi_info) error { - ptr, err := getLibFuncPtr(hdl, "fi_freeinfo") - if err != nil { - return err - } - - C.call_fi_freeinfo(ptr, info) - return nil -} - -func getLibFuncPtr(hdl *dlopen.LibHandle, fnName string) (unsafe.Pointer, error) { - fnPtr, err := hdl.GetSymbolPointer(fnName) - if err != nil { - return nil, err - } - - if fnPtr == nil { - return nil, errors.Errorf("%q is nil", fnName) - } - - return fnPtr, nil -} diff --git a/src/control/lib/hardware/libfabric/provider.go b/src/control/lib/hardware/libfabric/provider.go deleted file mode 100644 index 23d330ee2b8..00000000000 --- a/src/control/lib/hardware/libfabric/provider.go +++ /dev/null @@ -1,154 +0,0 @@ -// -// (C) Copyright 2021-2022 Intel Corporation. -// -// SPDX-License-Identifier: BSD-2-Clause-Patent -// - -package libfabric - -import ( - "context" - "strings" - - "github.com/pkg/errors" - - "github.com/daos-stack/daos/src/control/lib/hardware" - "github.com/daos-stack/daos/src/control/logging" -) - -// NewProvider creates a new libfabric data provider. -func NewProvider(log logging.Logger) *Provider { - return &Provider{ - log: log, - } -} - -// Provider provides information from libfabric's API. -type Provider struct { - log logging.Logger -} - -// GetFabricInterfaces harvests the collection of fabric interfaces from libfabric. -func (p *Provider) GetFabricInterfaces(ctx context.Context, provider string) (*hardware.FabricInterfaceSet, error) { - ch := make(chan *fabricResult) - go p.getFabricInterfaces(provider, ch) - select { - case <-ctx.Done(): - return nil, ctx.Err() - case result := <-ch: - return result.fiSet, result.err - } -} - -type fabricResult struct { - fiSet *hardware.FabricInterfaceSet - err error -} - -func (p *Provider) getFabricInterfaces(provider string, ch chan *fabricResult) { - hdl, err := openLib() - if err != nil { - ch <- &fabricResult{ - err: err, - } - return - } - defer hdl.Close() - - fiInfo, cleanup, err := fiGetInfo(p.log, hdl, extProviderToLibFabric(provider)) - if err != nil { - ch <- &fabricResult{ - err: err, - } - return - } - defer func() { - if err := cleanup(); err != nil { - p.log.Errorf("unable to clean up fi_info: %s", err.Error()) - } - }() - - fis := hardware.NewFabricInterfaceSet() - - for i, info := range fiInfo { - newFI, err := p.infoToFabricInterface(info, i) - if err != nil { - p.log.Error(err.Error()) - continue - } - fis.Update(newFI) - } - - p.log.Tracef("found fabric interfaces:\n%s", fis) - - ch <- &fabricResult{ - fiSet: fis, - } -} - -type info interface { - domainName() string - fabricProvider() string -} - -func (p *Provider) infoToFabricInterface(fi info, priority int) (*hardware.FabricInterface, error) { - if fi == nil { - return nil, errors.New("nil FI info") - } - - name := fi.domainName() - if name == "" { - return nil, errors.New("libfabric info has no domain name") - } - - lfProvider := fi.fabricProvider() - extProvider, err := libFabricProviderListToExt(lfProvider) - if err != nil { - return nil, errors.Errorf("failed to parse provider %q: %s", lfProvider, err.Error()) - } - - newFI := &hardware.FabricInterface{ - Name: name, - OSName: name, - Providers: hardware.NewFabricProviderSet(&hardware.FabricProvider{ - Name: extProvider, - Priority: priority, - }), - } - return newFI, nil -} - -func extProviderToLibFabric(provider string) string { - return strings.TrimPrefix(provider, "ofi+") -} - -// libFabricProviderToExt converts a single libfabric provider string into a DAOS provider string -func libFabricProviderToExt(provider string) string { - if provider == "ofi_rxm" { - return provider - } - return "ofi+" + provider -} - -// libFabricProviderListToExt converts a libfabric provider string containing one or more providers -// separated by ';' into a DAOS compatible provider string. -func libFabricProviderListToExt(providerList string) (string, error) { - var result string - - trimmedList := strings.TrimSpace(providerList) - - if len(trimmedList) == 0 { - return "", errors.New("provider list was empty") - } - - providers := strings.Split(providerList, ";") - for _, subProvider := range providers { - subProvider = strings.TrimSpace(subProvider) - if subProvider == "" { - return "", errors.Errorf("malformed provider list %q", providerList) - } - result += libFabricProviderToExt(subProvider) + ";" - } - - return strings.TrimSuffix(result, ";"), nil -} diff --git a/src/control/lib/hardware/libfabric/provider_test.go b/src/control/lib/hardware/libfabric/provider_test.go deleted file mode 100644 index f218d9cb349..00000000000 --- a/src/control/lib/hardware/libfabric/provider_test.go +++ /dev/null @@ -1,216 +0,0 @@ -// -// (C) Copyright 2021-2022 Intel Corporation. -// -// SPDX-License-Identifier: BSD-2-Clause-Patent -// - -package libfabric - -import ( - "context" - "errors" - "testing" - "time" - - "github.com/google/go-cmp/cmp" - - "github.com/daos-stack/daos/src/control/common/test" - "github.com/daos-stack/daos/src/control/lib/hardware" - "github.com/daos-stack/daos/src/control/logging" -) - -func TestLibfabric_Provider_GetFabricInterfaces_Integrated(t *testing.T) { - cleanup, err := Load() - if err != nil { - t.Skipf("libfabric not installed (%s)", err.Error()) - } - defer cleanup() - - for name, tc := range map[string]struct { - provider string - }{ - "all": {}, - "tcp": { - provider: "ofi+tcp", - }, - "not valid": { - provider: "fake", - }, - } { - t.Run(name, func(t *testing.T) { - log, buf := logging.NewTestLogger(t.Name()) - defer test.ShowBufferOnFailure(t, buf) - - // Can't mock the underlying libfabric calls, but we can make sure it doesn't crash or - // error on the normal happy path. - - p := NewProvider(log) - - ctx, cancel := context.WithTimeout(test.Context(t), 10*time.Second) - defer cancel() - - result, err := p.GetFabricInterfaces(ctx, tc.provider) - if err != nil { - t.Fatal(err.Error()) - } - - t.Logf("\nwith %s:\n%+v\n", name, result) - }) - } -} - -type mockInfo struct { - domainNameReturn string - fabricProviderReturn string -} - -func (m *mockInfo) domainName() string { - return m.domainNameReturn -} - -func (m *mockInfo) fabricProvider() string { - return m.fabricProviderReturn -} - -func TestLibfabric_Provider_fiInfoToFabricInterfaceSet(t *testing.T) { - testPriority := 5 - for name, tc := range map[string]struct { - in info - expResult *hardware.FabricInterface - expErr error - }{ - "nil": { - expErr: errors.New("nil"), - }, - "no domain": { - in: &mockInfo{ - fabricProviderReturn: "provider_x", - }, - expErr: errors.New("domain name"), - }, - "no provider": { - in: &mockInfo{ - domainNameReturn: "fi0_domain", - }, - expErr: errors.New("provider"), - }, - "success": { - in: &mockInfo{ - domainNameReturn: "fi0_domain", - fabricProviderReturn: "provider_x", - }, - expResult: &hardware.FabricInterface{ - Name: "fi0_domain", - OSName: "fi0_domain", - Providers: hardware.NewFabricProviderSet( - &hardware.FabricProvider{ - Name: "ofi+provider_x", - Priority: testPriority, - }, - ), - }, - }, - } { - t.Run(name, func(t *testing.T) { - log, buf := logging.NewTestLogger(name) - defer test.ShowBufferOnFailure(t, buf) - - p := NewProvider(log) - - result, err := p.infoToFabricInterface(tc.in, testPriority) - - test.CmpErr(t, tc.expErr, err) - if diff := cmp.Diff(tc.expResult, result, cmp.AllowUnexported(hardware.FabricProviderSet{})); diff != "" { - t.Errorf("(-want, +got)\n%s\n", diff) - } - }) - } -} - -func TestLibfabric_libFabricProviderListToExt(t *testing.T) { - for name, tc := range map[string]struct { - in string - expOut string - expErr error - }{ - "empty": { - expErr: errors.New("empty"), - }, - "all whitespace": { - in: "\t\n ", - expErr: errors.New("empty"), - }, - "sockets": { - in: "sockets", - expOut: "ofi+sockets", - }, - "tcp": { - in: "tcp", - expOut: "ofi+tcp", - }, - "tcp with ofi_rxm": { - in: "tcp;ofi_rxm", - expOut: "ofi+tcp;ofi_rxm", - }, - "verbs": { - in: "verbs", - expOut: "ofi+verbs", - }, - "verbs with ofi_rxm": { - in: "verbs;ofi_rxm", - expOut: "ofi+verbs;ofi_rxm", - }, - "psm2": { - in: "psm2", - expOut: "ofi+psm2", - }, - "gni": { - in: "gni", - expOut: "ofi+gni", - }, - "cxi": { - in: "cxi", - expOut: "ofi+cxi", - }, - "unknown": { - in: "provider_x", - expOut: "ofi+provider_x", - }, - "badly formed": { - in: " ;ofi_rxm", - expErr: errors.New("malformed"), - }, - } { - t.Run(name, func(t *testing.T) { - out, err := libFabricProviderListToExt(tc.in) - - test.CmpErr(t, tc.expErr, err) - test.AssertEqual(t, tc.expOut, out, "") - }) - } -} - -func TestLibfabric_extProviderToLibFabric(t *testing.T) { - for name, tc := range map[string]struct { - in string - expOut string - }{ - "empty": {}, - "no ofi prefix": { - in: "tcp", - expOut: "tcp", - }, - "ofi prefix": { - in: "ofi+verbs", - expOut: "verbs", - }, - "some other prefix": { - in: "ucx+tcp", - expOut: "ucx+tcp", - }, - } { - t.Run(name, func(t *testing.T) { - test.AssertEqual(t, tc.expOut, extProviderToLibFabric(tc.in), "") - }) - } -} diff --git a/src/control/lib/hardware/ucx/bindings.go b/src/control/lib/hardware/ucx/bindings.go deleted file mode 100644 index b956cb5b917..00000000000 --- a/src/control/lib/hardware/ucx/bindings.go +++ /dev/null @@ -1,508 +0,0 @@ -// -// (C) Copyright 2022 Intel Corporation. -// -// SPDX-License-Identifier: BSD-2-Clause-Patent -// -//go:build ucx -// +build ucx - -package ucx - -/* -#include -#include -#include - -void -call_ucs_debug_disable_signal(void *fn, int signum) -{ - void (*disable)(int); - - assert(fn != NULL); - disable = fn; - - disable(signum); -} - -uct_component_h -get_component_from_list(uct_component_h *components, uint idx) -{ - assert(components != NULL); - - return components[idx]; -} - -char * -get_component_attr_name(uct_component_attr_t *attr) -{ - assert(attr != NULL); - - return attr->name; -} - -char * -get_md_resource_name_from_list(uct_md_resource_desc_t *mdr, uint idx) -{ - assert(mdr != NULL); - - return mdr[idx].md_name; -} - -char * -get_tl_resource_name_from_list(uct_tl_resource_desc_t *tlr, uint idx) -{ - assert(tlr != NULL); - - return tlr[idx].tl_name; -} - -char * -get_tl_resource_device_from_list(uct_tl_resource_desc_t *tlr, uint idx) -{ - assert(tlr != NULL); - - return tlr[idx].dev_name; -} - -uct_device_type_t -get_tl_resource_type_from_list(uct_tl_resource_desc_t *tlr, uint idx) -{ - assert(tlr != NULL); - - return tlr[idx].dev_type; -} - -const char * -get_tl_resource_dev_type_str(void *map, uct_device_type_t dev_type) -{ - const char **names = (const char **)map; - - return names[dev_type]; -} - -void -alloc_uct_component_attr_md_resources(uct_component_attr_t *attr, unsigned count) -{ - attr->md_resources = calloc(sizeof(*attr->md_resources), count); -} - -ucs_status_t -call_uct_query_components(void *fn, uct_component_h **components, unsigned *num_components) -{ - ucs_status_t (*query)(uct_component_h **, unsigned *); - - assert(fn != NULL); - query = fn; - - return query(components, num_components); -} - -void -call_uct_release_component_list(void *fn, uct_component_h *components) -{ - void (*release)(uct_component_h *); - - assert(fn != NULL); - release = fn; - - release(components); -} - -ucs_status_t -call_uct_component_query(void *fn, uct_component_h component, uct_component_attr_t *attr) -{ - ucs_status_t (*query)(uct_component_h, uct_component_attr_t *); - - assert(fn != NULL); - query = fn; - - return query(component, attr); -} - -ucs_status_t -call_uct_md_config_read(void *fn, uct_component_h component, uct_md_config_t **md_config) -{ - ucs_status_t (*cfg_read)(uct_component_h, const char *, const char *, uct_md_config_t **); - - assert(fn != NULL); - cfg_read = fn; - - return cfg_read(component, NULL, NULL, md_config); -} - -void -call_uct_config_release(void *fn, void *config) -{ - void (*release)(void *); - - assert(fn != NULL); - release = fn; - - release(config); -} - -ucs_status_t -call_uct_md_open(void *fn, uct_component_h component, const char *md_name, - const uct_md_config_t *config, uct_md_h *md) -{ - ucs_status_t (*md_open)(uct_component_h, const char *, const uct_md_config_t *, uct_md_h *); - - assert(fn != NULL); - md_open = fn; - - return md_open(component, md_name, config, md); -} - -void -call_uct_md_close(void *fn, uct_md_h md) -{ - void (*md_close)(uct_md_h); - - assert(fn != NULL); - md_close = fn; - - md_close(md); -} - -ucs_status_t -call_uct_md_query_tl_resources(void *fn, uct_md_h md, uct_tl_resource_desc_t **resources, - unsigned *num_resources) -{ - ucs_status_t (*query_tl)(uct_md_h, uct_tl_resource_desc_t **, unsigned *); - - assert(fn != NULL); - query_tl = fn; - - return query_tl(md, resources, num_resources); -} - -void -call_uct_release_tl_resource_list(void *fn, uct_tl_resource_desc_t *resources) -{ - void (*release)(uct_tl_resource_desc_t *); - - assert(fn != NULL); - release = fn; - - release(resources); -} -*/ -import "C" - -import ( - "fmt" - "syscall" - "unsafe" - - "github.com/pkg/errors" - - "github.com/daos-stack/daos/src/control/lib/dlopen" -) - -// Load dynamically loads the UCX libraries and provides a method to unload them. -func Load() (func(), error) { - ucsHdl, err := openUCS() - if err != nil { - return nil, errors.Wrap(err, "loading libucs") - } - defer ucsHdl.Close() - - if err := ucsDisableSignal(ucsHdl, syscall.SIGSEGV); err != nil { - return nil, errors.Wrap(err, "disabling UCX signal handling") - } - - hdl, err := openUCT() - if err != nil { - return nil, errors.Wrap(err, "loading libuct") - } - return func() { - hdl.Close() - }, nil -} - -func openUCT() (*dlopen.LibHandle, error) { - return dlopen.GetHandle([]string{"libuct.so.0"}) -} - -func openUCS() (*dlopen.LibHandle, error) { - return dlopen.GetHandle([]string{"libucs.so.0"}) -} - -func ucsDisableSignal(hdl *dlopen.LibHandle, sig syscall.Signal) error { - fn, err := getLibFuncPtr(hdl, "ucs_debug_disable_signal") - if err != nil { - return err - } - C.call_ucs_debug_disable_signal(fn, C.int(sig)) - return nil -} - -func getLibFuncPtr(hdl *dlopen.LibHandle, fnName string) (unsafe.Pointer, error) { - fnPtr, err := hdl.GetSymbolPointer(fnName) - if err != nil { - return nil, err - } - - if fnPtr == nil { - return nil, errors.Errorf("%q is nil", fnName) - } - - return fnPtr, nil -} - -type uctComponent struct { - cComponent C.uct_component_h - name string - flags uint64 - mdResourceCount uint -} - -func getUCTComponents(uctHdl *dlopen.LibHandle) ([]*uctComponent, func() error, error) { - cComponents, numComponents, err := uctQueryComponents(uctHdl) - if err != nil { - return nil, nil, errors.Wrap(err, "getting component list") - } - - components := make([]*uctComponent, 0, numComponents) - for i := C.uint(0); i < numComponents; i++ { - comp := C.get_component_from_list(cComponents, i) - - var attr C.uct_component_attr_t - - attr.field_mask = C.UCT_COMPONENT_ATTR_FIELD_NAME | - C.UCT_COMPONENT_ATTR_FIELD_MD_RESOURCE_COUNT | - C.UCT_COMPONENT_ATTR_FIELD_FLAGS - if err := uctComponentQuery(uctHdl, comp, &attr); err != nil { - uctReleaseComponentList(uctHdl, cComponents) - return nil, nil, errors.Wrap(err, "querying component details") - } - - components = append(components, &uctComponent{ - cComponent: comp, - name: C.GoString(C.get_component_attr_name(&attr)), - flags: uint64(attr.flags), - mdResourceCount: uint(attr.md_resource_count), - }) - } - - return components, func() error { - return uctReleaseComponentList(uctHdl, cComponents) - }, nil -} - -func uctQueryComponents(uctHdl *dlopen.LibHandle) (*C.uct_component_h, C.uint, error) { - fn, err := getLibFuncPtr(uctHdl, "uct_query_components") - if err != nil { - return nil, 0, err - } - - var cComponents *C.uct_component_h - var cNumComponents C.uint - - status := C.call_uct_query_components(fn, &cComponents, &cNumComponents) - if status != C.UCS_OK { - return nil, 0, errors.Errorf("uct_query_components() failed: %d", status) - } - return cComponents, cNumComponents, nil -} - -func uctReleaseComponentList(uctHdl *dlopen.LibHandle, components *C.uct_component_h) error { - fn, err := getLibFuncPtr(uctHdl, "uct_release_component_list") - if err != nil { - return err - } - - C.call_uct_release_component_list(fn, components) - return nil -} - -func getMDResourceNames(uctHdl *dlopen.LibHandle, component *uctComponent) ([]string, error) { - var attr C.uct_component_attr_t - - attr.field_mask = C.UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES - C.alloc_uct_component_attr_md_resources(&attr, C.uint(component.mdResourceCount)) - if attr.md_resources == nil { - return nil, errors.New("failed to allocate memory for MD resources") - } - defer C.free(unsafe.Pointer(attr.md_resources)) - - if err := uctComponentQuery(uctHdl, component.cComponent, &attr); err != nil { - return nil, errors.Wrapf(err, "getting MD resources for %q", component.name) - } - - names := make([]string, 0, component.mdResourceCount) - for i := uint(0); i < component.mdResourceCount; i++ { - name := C.get_md_resource_name_from_list(attr.md_resources, C.uint(i)) - names = append(names, C.GoString(name)) - } - - return names, nil -} - -func uctComponentQuery(uctHdl *dlopen.LibHandle, component C.uct_component_h, attr *C.uct_component_attr_t) error { - fn, err := getLibFuncPtr(uctHdl, "uct_component_query") - if err != nil { - return err - } - - status := C.call_uct_component_query(fn, component, attr) - if status != C.UCS_OK { - return errors.Errorf("uct_component_query() failed: %d", status) - } - - return nil -} - -type uctMDConfig struct { - component string - cCfg *C.uct_md_config_t -} - -func getComponentMDConfig(uctHdl *dlopen.LibHandle, comp *uctComponent) (*uctMDConfig, func() error, error) { - mdCfg := &uctMDConfig{ - component: comp.name, - } - - cCfg, err := uctMDConfigRead(uctHdl, comp.cComponent) - if err != nil { - return nil, nil, errors.Wrapf(err, "fetching MD config for %q", comp.name) - } - - mdCfg.cCfg = cCfg - - return mdCfg, func() error { - return errors.Wrap(uctMDConfigFree(uctHdl, unsafe.Pointer(cCfg)), "freeing MD config") - }, nil -} - -func uctMDConfigRead(uctHdl *dlopen.LibHandle, component C.uct_component_h) (*C.uct_md_config_t, error) { - fn, err := getLibFuncPtr(uctHdl, "uct_md_config_read") - if err != nil { - return nil, err - } - - var cfg *C.uct_md_config_t - status := C.call_uct_md_config_read(fn, component, &cfg) - if status != C.UCS_OK { - return nil, errors.Errorf("uct_md_config_read() failed: %d", status) - } - - return cfg, nil -} - -func uctMDConfigFree(uctHdl *dlopen.LibHandle, config unsafe.Pointer) error { - fn, err := getLibFuncPtr(uctHdl, "uct_config_release") - if err != nil { - return err - } - - C.call_uct_config_release(fn, config) - - return nil -} - -type uctMD struct { - name string - cMD C.uct_md_h -} - -func openMDResource(uctHdl *dlopen.LibHandle, comp *uctComponent, mdName string, cfg *uctMDConfig) (*uctMD, func() error, error) { - cMD, err := uctMDOpen(uctHdl, comp.cComponent, mdName, cfg.cCfg) - if err != nil { - return nil, nil, errors.Wrapf(err, "opening MD %q", mdName) - } - - md := &uctMD{ - name: mdName, - cMD: cMD, - } - return md, func() error { - return errors.Wrapf(uctMDClose(uctHdl, cMD), "closing MD %q", mdName) - }, nil -} - -func uctMDOpen(uctHdl *dlopen.LibHandle, component C.uct_component_h, name string, config *C.uct_md_config_t) (C.uct_md_h, error) { - fn, err := getLibFuncPtr(uctHdl, "uct_md_open") - if err != nil { - return nil, err - } - - var md C.uct_md_h - cName := C.CString(name) - defer C.free(unsafe.Pointer(cName)) - status := C.call_uct_md_open(fn, component, cName, config, &md) - if status != C.UCS_OK { - return nil, errors.Errorf("uct_md_open() failed: %d", status) - } - - return md, nil -} - -func uctMDClose(uctHdl *dlopen.LibHandle, md C.uct_md_h) error { - fn, err := getLibFuncPtr(uctHdl, "uct_md_close") - if err != nil { - return err - } - - C.call_uct_md_close(fn, md) - return nil -} - -type transportDev struct { - transport string - device string - devType C.uct_device_type_t -} - -func (d *transportDev) String() string { - return fmt.Sprintf("transport=%q, device=%q", d.transport, d.device) -} - -func (d *transportDev) isNetwork() bool { - return d.devType == C.UCT_DEVICE_TYPE_NET -} - -func getMDTransportDevices(uctHdl *dlopen.LibHandle, md *uctMD) ([]*transportDev, error) { - tlResources, tlResourceCount, err := uctMDQueryTLResources(uctHdl, md.cMD) - if err != nil { - return nil, errors.Wrapf(err, "querying TL resources for %q", md.name) - } - defer uctReleaseTLResourceList(uctHdl, tlResources) - - tlDevs := make([]*transportDev, 0, int(tlResourceCount)) - for i := C.uint(0); i < tlResourceCount; i++ { - tlDevs = append(tlDevs, &transportDev{ - transport: C.GoString(C.get_tl_resource_name_from_list(tlResources, i)), - device: C.GoString(C.get_tl_resource_device_from_list(tlResources, i)), - devType: C.get_tl_resource_type_from_list(tlResources, i), - }) - } - - return tlDevs, nil -} - -func uctMDQueryTLResources(uctHdl *dlopen.LibHandle, md C.uct_md_h) (*C.uct_tl_resource_desc_t, C.uint, error) { - fn, err := getLibFuncPtr(uctHdl, "uct_md_query_tl_resources") - if err != nil { - return nil, 0, err - } - - var resources *C.uct_tl_resource_desc_t - var num_resources C.uint - status := C.call_uct_md_query_tl_resources(fn, md, &resources, &num_resources) - if status != C.UCS_OK { - return nil, 0, errors.Errorf("uct_md_query_tl_resources() failed: %d", status) - } - - return resources, num_resources, nil -} - -func uctReleaseTLResourceList(uctHdl *dlopen.LibHandle, resources *C.uct_tl_resource_desc_t) error { - fn, err := getLibFuncPtr(uctHdl, "uct_release_tl_resource_list") - if err != nil { - return err - } - - C.call_uct_release_tl_resource_list(fn, resources) - return nil -} diff --git a/src/control/lib/hardware/ucx/bindings_test.go b/src/control/lib/hardware/ucx/bindings_test.go deleted file mode 100644 index 69b0e765f96..00000000000 --- a/src/control/lib/hardware/ucx/bindings_test.go +++ /dev/null @@ -1,37 +0,0 @@ -// -// (C) Copyright 2022 Intel Corporation. -// -// SPDX-License-Identifier: BSD-2-Clause-Patent -// -//go:build ucx -// +build ucx - -package ucx - -import ( - "fmt" - "testing" -) - -func TestUCXBindings_signalHandling(t *testing.T) { - close, err := Load() - if err != nil { - t.Fatal(err) - } - defer close() - - defer func() { - // We would expect this to allow us to recover from the SIGSEGV - // we'll trigger on ourselves below. - if result := recover(); result != nil { - fmt.Printf("successfully recovered from panic: %+v\n", result) - } - }() - - type Example struct { - Val int - } - - var ex *Example - _ = ex.Val // this will segfault -} diff --git a/src/control/lib/hardware/ucx/disabled_bindings.go b/src/control/lib/hardware/ucx/disabled_bindings.go deleted file mode 100644 index 355e6d85f89..00000000000 --- a/src/control/lib/hardware/ucx/disabled_bindings.go +++ /dev/null @@ -1,68 +0,0 @@ -// -// (C) Copyright 2022 Intel Corporation. -// -// SPDX-License-Identifier: BSD-2-Clause-Patent -// -//go:build !ucx -// +build !ucx - -package ucx - -import ( - "github.com/daos-stack/daos/src/control/lib/dlopen" - "github.com/daos-stack/daos/src/control/lib/hardware" -) - -var errNotSupported = hardware.ErrUnsupportedFabric("ucx") - -// Load reports that the library is not supported. -func Load() (func(), error) { - return nil, errNotSupported -} - -func openUCT() (*dlopen.LibHandle, error) { - return nil, errNotSupported -} - -type uctComponent struct { - name string -} - -func getUCTComponents(uctHdl *dlopen.LibHandle) ([]*uctComponent, func() error, error) { - return nil, nil, errNotSupported -} - -func getMDResourceNames(uctHdl *dlopen.LibHandle, component *uctComponent) ([]string, error) { - return nil, errNotSupported -} - -type uctMDConfig struct { -} - -func getComponentMDConfig(uctHdl *dlopen.LibHandle, comp *uctComponent) (*uctMDConfig, func() error, error) { - return nil, nil, errNotSupported -} - -type uctMD struct { -} - -func openMDResource(uctHdl *dlopen.LibHandle, comp *uctComponent, mdName string, cfg *uctMDConfig) (*uctMD, func() error, error) { - return nil, nil, errNotSupported -} - -type transportDev struct { - transport string - device string -} - -func (d *transportDev) String() string { - return "" -} - -func (d *transportDev) isNetwork() bool { - return false -} - -func getMDTransportDevices(uctHdl *dlopen.LibHandle, md *uctMD) ([]*transportDev, error) { - return nil, errNotSupported -} diff --git a/src/control/lib/hardware/ucx/ucx.go b/src/control/lib/hardware/ucx/ucx.go deleted file mode 100644 index 2ecd6242394..00000000000 --- a/src/control/lib/hardware/ucx/ucx.go +++ /dev/null @@ -1,208 +0,0 @@ -// -// (C) Copyright 2022 Intel Corporation. -// -// SPDX-License-Identifier: BSD-2-Clause-Patent -// - -package ucx - -import ( - "context" - "strings" - - "github.com/daos-stack/daos/src/control/common" - "github.com/daos-stack/daos/src/control/lib/dlopen" - "github.com/daos-stack/daos/src/control/lib/hardware" - "github.com/daos-stack/daos/src/control/logging" -) - -const ( - compInfiniband = "ib" - compTCP = "tcp" - - tcpPriority = 25 - catchallPriority = 99 -) - -// NewProvider creates a new UCX data provider. -func NewProvider(log logging.Logger) *Provider { - return &Provider{ - log: log, - } -} - -// Provider provides information from UCX's API. -type Provider struct { - log logging.Logger -} - -// GetFabricInterfaces harvests the collection of fabric interfaces from UCX. -func (p *Provider) GetFabricInterfaces(ctx context.Context, provider string) (*hardware.FabricInterfaceSet, error) { - uctHdl, err := openUCT() - if err != nil { - return nil, err - } - defer uctHdl.Close() - - components, cleanupComp, err := getUCTComponents(uctHdl) - if err != nil { - return nil, err - } - defer func() { - if err := cleanupComp(); err != nil { - p.log.Errorf("error cleaning up UCT components: %s", err.Error()) - } - }() - - supportedComps := common.NewStringSet(compInfiniband, compTCP) - fis := hardware.NewFabricInterfaceSet() - - for _, comp := range components { - if !supportedComps.Has(comp.name) { - continue - } - - mdResources, err := getMDResourceNames(uctHdl, comp) - if err != nil { - p.log.Error(err.Error()) - continue - } - - cfg, cleanupCfg, err := getComponentMDConfig(uctHdl, comp) - if err != nil { - p.log.Error(err.Error()) - continue - } - defer func(name string) { - if err := cleanupCfg(); err != nil { - p.log.Error(err.Error()) - } - }(comp.name) - - netDevs, err := p.getCompNetworkDevices(uctHdl, comp, mdResources, cfg) - if err != nil { - p.log.Error(err.Error()) - continue - } - - if err := p.addFabricDevices(comp.name, netDevs, provider, fis); err != nil { - p.log.Error(err.Error()) - } - } - - return fis, nil -} - -func (p *Provider) getCompNetworkDevices(uctHdl *dlopen.LibHandle, comp *uctComponent, - mdResources []string, cfg *uctMDConfig) ([]*transportDev, error) { - netDevs := make([]*transportDev, 0) - for _, mdName := range mdResources { - md, cleanupMD, err := openMDResource(uctHdl, comp, mdName, cfg) - if err != nil { - return nil, err - } - defer func() { - if err := cleanupMD(); err != nil { - p.log.Errorf(err.Error()) - } - }() - - tlDevs, err := getMDTransportDevices(uctHdl, md) - if err != nil { - return nil, err - } - - for _, dev := range tlDevs { - if !dev.isNetwork() { - continue - } - - netDevs = append(netDevs, dev) - } - } - - return netDevs, nil -} - -func (p *Provider) addFabricDevices(comp string, netDevs []*transportDev, provider string, fis *hardware.FabricInterfaceSet) error { - allDevs := common.NewStringSet() - for _, dev := range netDevs { - allDevs.AddUnique(dev.device) - } - - for _, dev := range netDevs { - provSet := p.getProviderSet(dev.transport) - if provider != "" && !provSet.Has(provider) { - continue - } - - // the device name is in a format like "mlx5_0:1" - osDev := strings.Split(dev.device, ":")[0] - - fis.Update(&hardware.FabricInterface{ - Name: dev.device, - OSName: osDev, - Providers: p.getProviderSet(dev.transport), - }) - } - - return nil -} - -func (p *Provider) getProviderSet(transport string) *hardware.FabricProviderSet { - genericTransport := strings.Split(transport, "_")[0] - - priority := 0 // by default use the highest - daosProv := transportToDAOSProvider(transport) - if daosProv == "ucx+tcp" { - priority = tcpPriority // TCP is less desirable than other options if this is Infiniband - } - providers := hardware.NewFabricProviderSet( - &hardware.FabricProvider{ - Name: daosProv, - Priority: priority, - }, - ) - if shouldAddGeneric(transport) { - providers.Add(&hardware.FabricProvider{ - Name: transportToDAOSProvider(genericTransport), - Priority: priority, - }) - } - // Any interface with at least one provider should allow ucx+all - providers.Add(&hardware.FabricProvider{ - Name: "ucx+all", - Priority: catchallPriority, - }) - return providers -} - -func shouldAddGeneric(transport string) bool { - genericTransportAliases := []string{"rc", "ud", "dc"} - for _, alias := range genericTransportAliases { - if strings.HasPrefix(transport, alias+"_") { - return true - } - } - return false -} - -func transportToDAOSProvider(transport string) string { - prefix := "ucx+" - transportPieces := strings.Split(transport, "_") - if len(transportPieces) < 2 { - return prefix + transport - } - - // Transport strings from the library need to be translated to the supported aliases. - // UCX transport aliases: - // https://openucx.readthedocs.io/en/master/faq.html#list-of-main-transports-and-aliases - switch { - case transportPieces[1] == "verbs": - transportPieces[1] = "v" - case strings.HasPrefix(transportPieces[1], "mlx"): - // accelerated Mellanox transport - transportPieces[1] = "x" - } - return prefix + strings.Join(transportPieces, "_") -} diff --git a/src/control/lib/hardware/ucx/ucx_interfaces_test.go b/src/control/lib/hardware/ucx/ucx_interfaces_test.go deleted file mode 100644 index 1c66f4478a9..00000000000 --- a/src/control/lib/hardware/ucx/ucx_interfaces_test.go +++ /dev/null @@ -1,56 +0,0 @@ -// -// (C) Copyright 2022 Intel Corporation. -// -// SPDX-License-Identifier: BSD-2-Clause-Patent -// -//go:build ucx -// +build ucx - -package ucx - -import ( - "context" - "fmt" - "testing" - "time" - - "github.com/daos-stack/daos/src/control/common/test" - "github.com/daos-stack/daos/src/control/logging" -) - -func TestUCX_Provider_GetFabricInterfaces_Integrated(t *testing.T) { - cleanup, err := Load() - if err != nil { - t.Errorf("can't load lib (%s)", err.Error()) - } - defer cleanup() - - for name, tc := range map[string]struct { - provider string - }{ - "all": {}, - "tcp": { - provider: "ucx+tcp", - }, - } { - t.Run(name, func(t *testing.T) { - // Can't mock the underlying UCX calls, but we can make sure it doesn't crash or - // error on the normal happy path. - - log, buf := logging.NewTestLogger(t.Name()) - defer test.ShowBufferOnFailure(t, buf) - - p := NewProvider(log) - - ctx, cancel := context.WithTimeout(test.Context(t), 10*time.Second) - defer cancel() - result, err := p.GetFabricInterfaces(ctx, tc.provider) - - if err != nil { - t.Fatal(err.Error()) - } - - fmt.Printf("with %s:\n%s\n", name, result) - }) - } -} diff --git a/src/control/server/server.go b/src/control/server/server.go index bda319671e4..4a3efd655fc 100644 --- a/src/control/server/server.go +++ b/src/control/server/server.go @@ -575,12 +575,6 @@ func Start(log logging.Logger, cfg *config.Server) error { return err } - hwprovFini, err := hwprov.Init(log) - if err != nil { - return err - } - defer hwprovFini() - if err := waitFabricReady(ctx, log, cfg); err != nil { return err } diff --git a/src/tests/ftest/control/dmg_network_scan.py b/src/tests/ftest/control/dmg_network_scan.py index 30491721834..0ccabbf4a57 100644 --- a/src/tests/ftest/control/dmg_network_scan.py +++ b/src/tests/ftest/control/dmg_network_scan.py @@ -32,7 +32,7 @@ def get_sys_info(self): server_provider = self.server_managers[0].get_config_value("provider") sys_info = [] for entry in get_network_information(self.log, self.hostlist_servers, SUPPORTED_PROVIDERS): - if server_provider in entry.provider: + if server_provider == entry.provider: entry.device = None sys_info.append(entry) return sys_info diff --git a/utils/githooks/pre-commit.d/60-gofmt.sh b/utils/githooks/pre-commit.d/60-gofmt.sh index 0f73fb30ab3..022c2466427 100755 --- a/utils/githooks/pre-commit.d/60-gofmt.sh +++ b/utils/githooks/pre-commit.d/60-gofmt.sh @@ -12,10 +12,10 @@ echo "Gofmt:" go_files= if [ "$TARGET" = "HEAD" ]; then echo " Checking against HEAD" - go_files=$(git diff HEAD --name-only | grep -e '.go$' || exit 0) + go_files=$(git diff HEAD --name-only --diff-filter=d | grep -e '.go$' || exit 0) else echo " Checking against branch ${TARGET}" - go_files=$(git diff "$TARGET"... --name-only | grep -e '.go$' || exit 0) + go_files=$(git diff "$TARGET"... --name-only --diff-filter=d | grep -e '.go$' || exit 0) fi if [ -z "$go_files" ]; then