Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16687 control: Handle missing PCIe caps in storage query usage #15296

Merged
merged 7 commits into from
Oct 23, 2024
3 changes: 2 additions & 1 deletion src/control/lib/hardware/pciutils/bindings.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ var (
ErrMultiDevices = errors.New("want single device config got multiple")
ErrCfgNotTerminated = errors.New("device config content not new-line terminated")
ErrCfgMissing = errors.New("incomplete device config")
ErrNoPCIeCaps = errors.New("no pci-express capabilities found")
)

// api provides the PCIeLinkStatsProvider interface by exposing a concrete implementation of
Expand Down Expand Up @@ -150,7 +151,7 @@ func (ap *api) PCIeCapsFromConfig(cfgBytes []byte, dev *hardware.PCIDevice) erro
var cp *C.struct_pci_cap = C.pci_find_cap(pciDev, C.PCI_CAP_ID_EXP, C.PCI_CAP_NORMAL)

if cp == nil {
return errors.New("no pci-express capabilities found")
return ErrNoPCIeCaps
}

cpAddr := uint32(cp.addr)
Expand Down
49 changes: 49 additions & 0 deletions src/control/server/ctl_smd_rpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/daos-stack/daos/src/control/common/test"
"github.com/daos-stack/daos/src/control/drpc"
"github.com/daos-stack/daos/src/control/lib/daos"
"github.com/daos-stack/daos/src/control/lib/hardware/pciutils"
"github.com/daos-stack/daos/src/control/lib/ranklist"
"github.com/daos-stack/daos/src/control/logging"
"github.com/daos-stack/daos/src/control/server/config"
Expand Down Expand Up @@ -88,6 +89,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
drpcResps map[int][]*mockDrpcResponse
harnessStopped bool
ioStopped bool
pciDevErr error
expResp *ctlpb.SmdQueryResp
expErr error
}{
Expand Down Expand Up @@ -658,6 +660,46 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
},
expErr: daos.FreeMemError,
},
"list-devices; with health; update link stats": {
req: &ctlpb.SmdQueryReq{
OmitPools: true,
Rank: uint32(ranklist.NilRank),
Uuid: test.MockUUID(1),
IncludeBioHealth: true,
},
drpcResps: map[int][]*mockDrpcResponse{
0: {
{
Message: &ctlpb.SmdDevResp{
Devices: []*ctlpb.SmdDevice{pbNormDev(0)},
},
},
},
1: {
{
Message: &ctlpb.SmdDevResp{
Devices: []*ctlpb.SmdDevice{
func() *ctlpb.SmdDevice {
sd := pbFaultDev(1)
sd.Ctrlr.PciCfg = "ABCD"
return sd
}(),
},
},
},
{
Message: &ctlpb.BioHealthResp{
Temperature: 1000000,
TempWarn: true,
},
},
},
},
// Prove mock link stats provider gets called when IncludeBioHealth
// flag is set and Ctrlr.PciCfg string is not empty.
pciDevErr: errors.New("link stats provider fail"),
expErr: errors.New("link stats provider fail"),
},
"ambiguous UUID": {
req: &ctlpb.SmdQueryReq{
Rank: uint32(ranklist.NilRank),
Expand All @@ -680,6 +722,13 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
log, buf := logging.NewTestLogger(t.Name())
defer test.ShowBufferOnFailure(t, buf)

linkStatsProv = &mockPCIeLinkStatsProvider{
pciDevErr: tc.pciDevErr,
}
defer func() {
linkStatsProv = pciutils.NewPCIeLinkStatsProvider()
}()

engineCount := len(tc.drpcResps)
if engineCount == 0 {
engineCount = 1
Expand Down
132 changes: 102 additions & 30 deletions src/control/server/instance_storage_rpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,14 @@ import (
)

var (
scanSmd = listSmdDevices
getCtrlrHealth = getBioHealth
// Function pointers to enable mocking.
scanSmd = listSmdDevices
scanHealth = getBioHealth
linkStatsProv = pciutils.NewPCIeLinkStatsProvider()

// Sentinel errors to enable comparison.
errEngineBdevScanEmptyDevList = errors.New("empty device list for engine instance")
errCtrlrHealthSkipped = errors.New("controller health update was skipped")
)

// newMntRet creates and populates SCM mount result.
Expand Down Expand Up @@ -168,13 +173,17 @@ func (ei *EngineInstance) StorageFormatSCM(ctx context.Context, force bool) (mRe
}

func addLinkInfoToHealthStats(prov hardware.PCIeLinkStatsProvider, pciCfg string, health *ctlpb.BioHealthResp) error {
if health == nil {
return errors.New("nil BioHealthResp")
}

// Convert byte-string to lspci-format.
sb := new(strings.Builder)
formatBytestring(pciCfg, sb)

pciDev := &hardware.PCIDevice{}
if err := prov.PCIeCapsFromConfig([]byte(sb.String()), pciDev); err != nil {
return err
return errors.Wrap(err, "pciutils lib")
}

// Copy link details from PCIDevice to health stats.
Expand Down Expand Up @@ -243,31 +252,74 @@ func publishLinkStatEvents(engine Engine, pciAddr string, stats *ctlpb.BioHealth
lastMaxWidthStr, lastWidthStr, stats.LinkPortId)
}

func populateCtrlrHealth(ctx context.Context, engine Engine, req *ctlpb.BioHealthReq, ctrlr *ctlpb.NvmeController, prov hardware.PCIeLinkStatsProvider) (bool, error) {
stateName := ctlpb.NvmeDevState_name[int32(ctrlr.DevState)]
if !ctrlr.CanSupplyHealthStats() {
engine.Debugf("skip fetching health stats on device %q in %q state",
ctrlr.PciAddr, stateName)
return false, nil
type ctrlrHealthReq struct {
meta bool
engine Engine
bhReq *ctlpb.BioHealthReq
ctrlr *ctlpb.NvmeController
linkStatsProv hardware.PCIeLinkStatsProvider
}

// Retrieve NVMe controller health statistics for those in an acceptable state. Return nil health
// resp if in a bad state.
func getCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (*ctlpb.BioHealthResp, error) {
stateName := ctlpb.NvmeDevState_name[int32(req.ctrlr.DevState)]
if !req.ctrlr.CanSupplyHealthStats() {
req.engine.Debugf("skip fetching health stats on device %q in %q state",
req.ctrlr.PciAddr, stateName)
return nil, errCtrlrHealthSkipped
}

health, err := getCtrlrHealth(ctx, engine, req)
health, err := scanHealth(ctx, req.engine, req.bhReq)
if err != nil {
return false, errors.Wrapf(err, "retrieve health stats for %q (state %q)", ctrlr,
return nil, errors.Wrapf(err, "retrieve health stats for %q (state %q)", req.ctrlr,
stateName)
}

if ctrlr.PciCfg != "" {
if err := addLinkInfoToHealthStats(prov, ctrlr.PciCfg, health); err != nil {
return false, errors.Wrapf(err, "add link stats for %q", ctrlr)
return health, nil
}

// Add link state and capability information to input health statistics for the given controller
// then if successful publish events based on link statistic changes. Link updated health stats to
// controller.
func setCtrlrHealthWithLinkInfo(req ctrlrHealthReq, health *ctlpb.BioHealthResp) error {
err := addLinkInfoToHealthStats(req.linkStatsProv, req.ctrlr.PciCfg, health)
if err == nil {
publishLinkStatEvents(req.engine, req.ctrlr.PciAddr, health)
} else {
if errors.Cause(err) != pciutils.ErrNoPCIeCaps {
return errors.Wrapf(err, "add link stats for %q", req.ctrlr)
}
req.engine.Debugf("device %q not reporting PCIe capabilities", req.ctrlr.PciAddr)
}

return nil
}

// Update controller health statistics and include link info if required and available.
func populateCtrlrHealth(ctx context.Context, req ctrlrHealthReq) (bool, error) {
health, err := getCtrlrHealth(ctx, req)
if err != nil {
if err == errCtrlrHealthSkipped {
// Nothing to do.
return false, nil
}
publishLinkStatEvents(engine, ctrlr.PciAddr, health)
return false, errors.Wrap(err, "get ctrlr health")
}

if req.linkStatsProv == nil {
req.engine.Debugf("device %q skip adding link stats; nil provider",
req.ctrlr.PciAddr)
} else if req.ctrlr.PciCfg == "" {
req.engine.Debugf("device %q skip adding link stats; empty pci cfg",
req.ctrlr.PciAddr)
} else {
engine.Debugf("no pcie config space received for %q, skip add link stats", ctrlr)
if err = setCtrlrHealthWithLinkInfo(req, health); err != nil {
return false, errors.Wrap(err, "set ctrlr health")
}
}

ctrlr.HealthStats = health
ctrlr.PciCfg = ""
req.ctrlr.HealthStats = health
return true, nil
}

Expand Down Expand Up @@ -305,12 +357,13 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc

c := seenCtrlrs[addr]

// Only minimal info provided in standard scan to enable result aggregation across
// homogeneous hosts.
engineRank, err := engine.GetRank()
if err != nil {
return nil, errors.Wrapf(err, "instance %d GetRank", engine.Index())
}

// Only provide minimal info in standard scan to enable result aggregation across
// homogeneous hosts.
nsd := &ctlpb.SmdDevice{
RoleBits: sd.RoleBits,
CtrlrNamespaceId: sd.CtrlrNamespaceId,
Expand All @@ -326,18 +379,29 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc
// Populate health if requested.
healthUpdated := false
if pbReq.Health && c.HealthStats == nil {
bhReq := &ctlpb.BioHealthReq{
DevUuid: sd.Uuid,
MetaSize: pbReq.MetaSize,
RdbSize: pbReq.RdbSize,
bhReq := &ctlpb.BioHealthReq{DevUuid: sd.Uuid}
if pbReq.Meta {
bhReq.MetaSize = pbReq.MetaSize
bhReq.RdbSize = pbReq.RdbSize
}
upd, err := populateCtrlrHealth(ctx, engine, bhReq, c,
pciutils.NewPCIeLinkStatsProvider())

chReq := ctrlrHealthReq{
engine: engine,
bhReq: bhReq,
ctrlr: c,
}
if !pbReq.Meta {
// Add link stats to health only if health requested w/o usage flag.
kjacque marked this conversation as resolved.
Show resolved Hide resolved
chReq.linkStatsProv = linkStatsProv
}

healthUpdated, err = populateCtrlrHealth(ctx, chReq)
if err != nil {
return nil, err
}
healthUpdated = upd
}
// Used to update health with link stats, now redundant.
c.PciCfg = ""
kjacque marked this conversation as resolved.
Show resolved Hide resolved

// Populate usage data if requested.
if pbReq.Meta {
Expand Down Expand Up @@ -510,12 +574,20 @@ func smdQueryEngine(ctx context.Context, engine Engine, pbReq *ctlpb.SmdQueryReq
continue // Skip health query if UUID doesn't match requested.
}
if pbReq.IncludeBioHealth {
bhReq := &ctlpb.BioHealthReq{DevUuid: dev.Uuid}
if _, err := populateCtrlrHealth(ctx, engine, bhReq, dev.Ctrlr,
pciutils.NewPCIeLinkStatsProvider()); err != nil {
chReq := ctrlrHealthReq{
engine: engine,
bhReq: &ctlpb.BioHealthReq{DevUuid: dev.Uuid},
ctrlr: dev.Ctrlr,
linkStatsProv: linkStatsProv,
}

if _, err = populateCtrlrHealth(ctx, chReq); err != nil {
return nil, err
}
}
// Used to update health with link stats, now redundant.
dev.Ctrlr.PciCfg = ""
kjacque marked this conversation as resolved.
Show resolved Hide resolved

if pbReq.Uuid != "" && dev.Uuid == pbReq.Uuid {
rResp.Devices = []*ctlpb.SmdDevice{dev}
found = true
Expand Down
Loading
Loading