Skip to content

Commit

Permalink
DAOS-16687 control: Handle missing PCIe caps in storage query usage (#…
Browse files Browse the repository at this point in the history
…15296) (#15392)

Missing PCIe capabilities when querying a NVMe SSD's configuration
space is unusual but should be handled gracefully by the control-plane
and shouldn't cause a failure to return usage statistics when calling
dmg storage query usage.

Update so that pciutils lib is only called when attempting to display
health stats via dmg and not when fetching usage info. Improve clarity
of workflow to ease maintenance and add test coverage for updates.
Enable continued functionality when NVMe device doesn't return any
extended capabilities in PCIe configuration space data by adding
sentinel error to library for such a case.

Signed-off-by: Tom Nabarro <[email protected]>
  • Loading branch information
tanabarr authored Oct 28, 2024
1 parent 67da3b9 commit c4cf4f7
Show file tree
Hide file tree
Showing 7 changed files with 308 additions and 103 deletions.
46 changes: 28 additions & 18 deletions src/control/common/proto/ctl/storage_nvme.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion src/control/lib/control/storage.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2023 Intel Corporation.
// (C) Copyright 2020-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -258,6 +258,8 @@ func StorageScan(ctx context.Context, rpcClient UnaryInvoker, req *StorageScanRe
// Health and meta details required to populate usage statistics.
Health: req.NvmeHealth || req.Usage,
Meta: req.Usage,
// Only request link stats if health explicitly requested.
LinkStats: req.NvmeHealth,
},
})
})
Expand Down
3 changes: 2 additions & 1 deletion src/control/lib/hardware/pciutils/bindings.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ var (
ErrMultiDevices = errors.New("want single device config got multiple")
ErrCfgNotTerminated = errors.New("device config content not new-line terminated")
ErrCfgMissing = errors.New("incomplete device config")
ErrNoPCIeCaps = errors.New("no pci-express capabilities found")
)

// api provides the PCIeLinkStatsProvider interface by exposing a concrete implementation of
Expand Down Expand Up @@ -150,7 +151,7 @@ func (ap *api) PCIeCapsFromConfig(cfgBytes []byte, dev *hardware.PCIDevice) erro
var cp *C.struct_pci_cap = C.pci_find_cap(pciDev, C.PCI_CAP_ID_EXP, C.PCI_CAP_NORMAL)

if cp == nil {
return errors.New("no pci-express capabilities found")
return ErrNoPCIeCaps
}

cpAddr := uint32(cp.addr)
Expand Down
49 changes: 49 additions & 0 deletions src/control/server/ctl_smd_rpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/daos-stack/daos/src/control/common/test"
"github.com/daos-stack/daos/src/control/drpc"
"github.com/daos-stack/daos/src/control/lib/daos"
"github.com/daos-stack/daos/src/control/lib/hardware/pciutils"
"github.com/daos-stack/daos/src/control/lib/ranklist"
"github.com/daos-stack/daos/src/control/logging"
"github.com/daos-stack/daos/src/control/server/config"
Expand Down Expand Up @@ -88,6 +89,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
drpcResps map[int][]*mockDrpcResponse
harnessStopped bool
ioStopped bool
pciDevErr error
expResp *ctlpb.SmdQueryResp
expErr error
}{
Expand Down Expand Up @@ -658,6 +660,46 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
},
expErr: daos.FreeMemError,
},
"list-devices; with health; update link stats": {
req: &ctlpb.SmdQueryReq{
OmitPools: true,
Rank: uint32(ranklist.NilRank),
Uuid: test.MockUUID(1),
IncludeBioHealth: true,
},
drpcResps: map[int][]*mockDrpcResponse{
0: {
{
Message: &ctlpb.SmdDevResp{
Devices: []*ctlpb.SmdDevice{pbNormDev(0)},
},
},
},
1: {
{
Message: &ctlpb.SmdDevResp{
Devices: []*ctlpb.SmdDevice{
func() *ctlpb.SmdDevice {
sd := pbFaultDev(1)
sd.Ctrlr.PciCfg = "ABCD"
return sd
}(),
},
},
},
{
Message: &ctlpb.BioHealthResp{
Temperature: 1000000,
TempWarn: true,
},
},
},
},
// Prove mock link stats provider gets called when IncludeBioHealth
// flag is set and Ctrlr.PciCfg string is not empty.
pciDevErr: errors.New("link stats provider fail"),
expErr: errors.New("link stats provider fail"),
},
"ambiguous UUID": {
req: &ctlpb.SmdQueryReq{
Rank: uint32(ranklist.NilRank),
Expand All @@ -680,6 +722,13 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
log, buf := logging.NewTestLogger(t.Name())
defer test.ShowBufferOnFailure(t, buf)

linkStatsProv = &mockPCIeLinkStatsProvider{
pciDevErr: tc.pciDevErr,
}
defer func() {
linkStatsProv = pciutils.NewPCIeLinkStatsProvider()
}()

engineCount := len(tc.drpcResps)
if engineCount == 0 {
engineCount = 1
Expand Down
134 changes: 103 additions & 31 deletions src/control/server/instance_storage_rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,19 @@ import (
"github.com/daos-stack/daos/src/control/events"
"github.com/daos-stack/daos/src/control/fault"
"github.com/daos-stack/daos/src/control/lib/hardware"
"github.com/daos-stack/daos/src/control/lib/hardware/hwprov"
"github.com/daos-stack/daos/src/control/lib/hardware/pciutils"
"github.com/daos-stack/daos/src/control/server/storage"
)

var (
scanSmd = listSmdDevices
getCtrlrHealth = getBioHealth
// Function pointers to enable mocking.
scanSmd = listSmdDevices
scanHealth = getBioHealth
linkStatsProv = pciutils.NewPCIeLinkStatsProvider()

// Sentinel errors to enable comparison.
errEngineBdevScanEmptyDevList = errors.New("empty device list for engine instance")
errCtrlrHealthSkipped = errors.New("controller health update was skipped")
)

// newMntRet creates and populates SCM mount result.
Expand Down Expand Up @@ -168,13 +173,17 @@ func (ei *EngineInstance) StorageFormatSCM(ctx context.Context, force bool) (mRe
}

func addLinkInfoToHealthStats(prov hardware.PCIeLinkStatsProvider, pciCfg string, health *ctlpb.BioHealthResp) error {
if health == nil {
return errors.New("nil BioHealthResp")
}

// Convert byte-string to lspci-format.
sb := new(strings.Builder)
formatBytestring(pciCfg, sb)

pciDev := &hardware.PCIDevice{}
if err := prov.PCIeCapsFromConfig([]byte(sb.String()), pciDev); err != nil {
return err
return errors.Wrap(err, "pciutils lib")
}

// Copy link details from PCIDevice to health stats.
Expand Down Expand Up @@ -243,31 +252,74 @@ func publishLinkStatEvents(engine Engine, pciAddr string, stats *ctlpb.BioHealth
lastMaxWidthStr, lastWidthStr, stats.LinkPortId)
}

func populateCtrlrHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq, ctrlr *ctlpb.NvmeController, prov hardware.PCIeLinkStatsProvider) (bool, error) {
stateName := ctlpb.NvmeDevState_name[int32(ctrlr.DevState)]
if !ctrlr.CanSupplyHealthStats() {
engine.Debugf("skip fetching health stats on device %q in %q state",
ctrlr.PciAddr, stateName)
return false, nil
type ctrlrHealthReq struct {
meta bool
engine Engine
bhReq *ctlpb.BioHealthReq
ctrlr *ctlpb.NvmeController
linkStatsProv hardware.PCIeLinkStatsProvider
}

// Retrieve NVMe controller health statistics for those in an acceptable state. Return nil health
// resp if in a bad state.
func getCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (*ctlpb.BioHealthResp, error) {
stateName := ctlpb.NvmeDevState_name[int32(req.ctrlr.DevState)]
if !req.ctrlr.CanSupplyHealthStats() {
req.engine.Debugf("skip fetching health stats on device %q in %q state",
req.ctrlr.PciAddr, stateName)
return nil, errCtrlrHealthSkipped
}

health, err := getCtrlrHealth(ctx, engine, req)
health, err := scanHealth(ctx, req.engine, req.bhReq)
if err != nil {
return false, errors.Wrapf(err, "retrieve health stats for %q (state %q)", ctrlr,
return nil, errors.Wrapf(err, "retrieve health stats for %q (state %q)", req.ctrlr,
stateName)
}

if ctrlr.PciCfg != "" {
if err := addLinkInfoToHealthStats(prov, ctrlr.PciCfg, health); err != nil {
return false, errors.Wrapf(err, "add link stats for %q", ctrlr)
return health, nil
}

// Add link state and capability information to input health statistics for the given controller
// then if successful publish events based on link statistic changes. Link updated health stats to
// controller.
func setCtrlrHealthWithLinkInfo(req ctrlrHealthReq, health *ctlpb.BioHealthResp) error {
err := addLinkInfoToHealthStats(req.linkStatsProv, req.ctrlr.PciCfg, health)
if err == nil {
publishLinkStatEvents(req.engine, req.ctrlr.PciAddr, health)
} else {
if errors.Cause(err) != pciutils.ErrNoPCIeCaps {
return errors.Wrapf(err, "add link stats for %q", req.ctrlr)
}
req.engine.Debugf("device %q not reporting PCIe capabilities", req.ctrlr.PciAddr)
}

return nil
}

// Update controller health statistics and include link info if required and available.
func populateCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (bool, error) {
health, err := getCtrlrHealth(ctx, req)
if err != nil {
if err == errCtrlrHealthSkipped {
// Nothing to do.
return false, nil
}
publishLinkStatEvents(engine, ctrlr.PciAddr, health)
return false, errors.Wrap(err, "get ctrlr health")
}

if req.linkStatsProv == nil {
req.engine.Debugf("device %q skip adding link stats; nil provider",
req.ctrlr.PciAddr)
} else if req.ctrlr.PciCfg == "" {
req.engine.Debugf("device %q skip adding link stats; empty pci cfg",
req.ctrlr.PciAddr)
} else {
engine.Debugf("no pcie config space received for %q, skip add link stats", ctrlr)
if err = setCtrlrHealthWithLinkInfo(req, health); err != nil {
return false, errors.Wrap(err, "set ctrlr health")
}
}

ctrlr.HealthStats = health
ctrlr.PciCfg = ""
req.ctrlr.HealthStats = health
return true, nil
}

Expand Down Expand Up @@ -305,12 +357,13 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc

c := seenCtrlrs[addr]

// Only minimal info provided in standard scan to enable result aggregation across
// homogeneous hosts.
engineRank, err := engine.GetRank()
if err != nil {
return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index())
}

// Only provide minimal info in standard scan to enable result aggregation across
// homogeneous hosts.
nsd := &ctlpb.SmdDevice{
RoleBits: sd.RoleBits,
CtrlrNamespaceId: sd.CtrlrNamespaceId,
Expand All @@ -326,18 +379,29 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc
// Populate health if requested.
healthUpdated := false
if pbReq.Health && c.HealthStats == nil {
bhReq := &ctlpb.BioHealthReq{
DevUuid: sd.Uuid,
MetaSize: pbReq.MetaSize,
RdbSize: pbReq.RdbSize,
bhReq := &ctlpb.BioHealthReq{DevUuid: sd.Uuid}
if pbReq.Meta {
bhReq.MetaSize = pbReq.MetaSize
bhReq.RdbSize = pbReq.RdbSize
}
upd, err := populateCtrlrHealth(ctx, engine, bhReq, c,
hwprov.DefaultPCIeLinkStatsProvider())

chReq := ctrlrHealthReq{
engine: engine,
bhReq: bhReq,
ctrlr: c,
}
if pbReq.LinkStats {
// Add link stats to health if flag set.
chReq.linkStatsProv = linkStatsProv
}

healthUpdated, err = populateCtrlrHealth(ctx, chReq)
if err != nil {
return nil, err
}
healthUpdated = upd
}
// Used to update health with link stats, now redundant.
c.PciCfg = ""

// Populate usage data if requested.
if pbReq.Meta {
Expand Down Expand Up @@ -510,12 +574,20 @@ func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq
continue // Skip health query if UUID doesn't match requested.
}
if pbReq.IncludeBioHealth {
bhReq := &ctlpb.BioHealthReq{DevUuid: dev.Uuid}
if _, err := populateCtrlrHealth(ctx, engine, bhReq, dev.Ctrlr,
hwprov.DefaultPCIeLinkStatsProvider()); err != nil {
chReq := ctrlrHealthReq{
engine: engine,
bhReq: &ctlpb.BioHealthReq{DevUuid: dev.Uuid},
ctrlr: dev.Ctrlr,
linkStatsProv: linkStatsProv,
}

if _, err = populateCtrlrHealth(ctx, chReq); err != nil {
return nil, err
}
}
// Used to update health with link stats, now redundant.
dev.Ctrlr.PciCfg = ""

if pbReq.Uuid != "" && dev.Uuid == pbReq.Uuid {
rResp.Devices = []*ctlpb.SmdDevice{dev}
found = true
Expand Down
Loading

0 comments on commit c4cf4f7

Please sign in to comment.