Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-14985 bio: Handle NVMe unplugged in list devs #13614

Merged
merged 8 commits into from
Jan 25, 2024
5 changes: 4 additions & 1 deletion src/bio/bio_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,10 @@ alloc_ctrlr_info(uuid_t dev_id, char *dev_name, struct bio_dev_info *b_info)
D_ASSERT(b_info->bdi_ctrlr == NULL);

if (dev_name == NULL) {
D_DEBUG(DB_MGMT, "missing bdev device name, skipping ctrlr info fetch\n");
D_DEBUG(DB_MGMT,
"missing bdev device name for device " DF_UUID ", skipping ctrlr "
"info fetch\n",
DP_UUID(dev_id));
return 0;
}

Expand Down
27 changes: 23 additions & 4 deletions src/control/server/ctl_smd_rpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ const (
devStateNew = ctlpb.NvmeDevState_NEW
devStateNormal = ctlpb.NvmeDevState_NORMAL
devStateFaulty = ctlpb.NvmeDevState_EVICTED
devStateUnplug = ctlpb.NvmeDevState_UNPLUGGED

ledStateIdentify = ctlpb.LedState_QUICK_BLINK
ledStateNormal = ctlpb.LedState_OFF
Expand Down Expand Up @@ -267,6 +268,15 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
LedState: ledStateFault,
},
},
{
Uuid: test.MockUUID(2),
TgtIds: []int32{},
Ctrlr: &ctlpb.NvmeController{
PciAddr: "0000:8b:00.0",
DevState: devStateUnplug,
LedState: ledStateUnknown,
},
},
},
},
},
Expand All @@ -276,7 +286,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
Message: &ctlpb.SmdDevResp{
Devices: []*ctlpb.SmdDevice{
{
Uuid: test.MockUUID(2),
Uuid: test.MockUUID(3),
TgtIds: []int32{0, 1, 2},
Ctrlr: &ctlpb.NvmeController{
PciAddr: "0000:da:00.0",
Expand All @@ -285,7 +295,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
},
},
{
Uuid: test.MockUUID(3),
Uuid: test.MockUUID(4),
TgtIds: []int32{3, 4, 5},
Ctrlr: &ctlpb.NvmeController{
PciAddr: "0000:db:00.0",
Expand Down Expand Up @@ -320,13 +330,22 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
LedState: ledStateFault,
},
},
{
Uuid: test.MockUUID(2),
TgtIds: []int32{},
Ctrlr: &ctlpb.NvmeController{
PciAddr: "0000:8b:00.0",
DevState: devStateUnplug,
LedState: ledStateUnknown,
},
},
},
Rank: uint32(0),
},
{
Devices: []*ctlpb.SmdDevice{
{
Uuid: test.MockUUID(2),
Uuid: test.MockUUID(3),
TgtIds: []int32{0, 1, 2},
Ctrlr: &ctlpb.NvmeController{
PciAddr: "0000:da:00.0",
Expand All @@ -335,7 +354,7 @@ func TestServer_CtlSvc_SmdQuery(t *testing.T) {
},
},
{
Uuid: test.MockUUID(3),
Uuid: test.MockUUID(4),
TgtIds: []int32{3, 4, 5},
Ctrlr: &ctlpb.NvmeController{
PciAddr: "0000:db:00.0",
Expand Down
10 changes: 8 additions & 2 deletions src/control/server/instance_storage_rpc.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2020-2023 Intel Corporation.
// (C) Copyright 2020-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand Down Expand Up @@ -198,6 +198,12 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc
return nil, errors.Errorf("smd %q has no ctrlr ref", sd.Uuid)
}

if sd.Ctrlr.DevState != ctlpb.NvmeDevState_NORMAL &&
sd.Ctrlr.DevState != ctlpb.NvmeDevState_NEW &&
sd.Ctrlr.DevState != ctlpb.NvmeDevState_EVICTED {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will work, but if it were me I'd add a method to check this on NvmeController, e.g. sd.Ctrlr.Scannable(). That keeps the definition of what makes the controller scannable in one place, so it's easier to maintain when the inevitable future changes happen (e.g. to protobufs, etc).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can do this, probably should also do the same for the list of states that can be queried for health as that is different.

engine.Debugf("smd %q skip ctrlr %+v with bad state", sd.Uuid, sd.Ctrlr)
continue
}
addr := sd.Ctrlr.PciAddr

if _, exists := seenCtrlrs[addr]; !exists {
Expand All @@ -213,7 +219,7 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc

// Populate health if requested.
healthUpdated := false
if pbReq.Health {
if pbReq.Health && c.HealthStats == nil {
bhReq := &ctlpb.BioHealthReq{
DevUuid: sd.Uuid,
MetaSize: pbReq.MetaSize,
Expand Down
37 changes: 36 additions & 1 deletion src/control/server/instance_storage_rpc_test.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// (C) Copyright 2023 Intel Corporation.
// (C) Copyright 2023-2024 Intel Corporation.
//
// SPDX-License-Identifier: BSD-2-Clause-Patent
//
Expand All @@ -26,6 +26,15 @@ import (

func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
c := storage.MockNvmeController(2)
withState := func(ctrlr *ctlpb.NvmeController, state ctlpb.NvmeDevState) *ctlpb.NvmeController {
ctrlr.DevState = state
ctrlr.HealthStats = nil
return ctrlr
}
withDevState := func(smd *ctlpb.SmdDevice, state ctlpb.NvmeDevState) *ctlpb.SmdDevice {
smd.Ctrlr.DevState = state
return smd
}
defSmdScanRes := func() *ctlpb.SmdDevResp {
return &ctlpb.SmdDevResp{
Devices: []*ctlpb.SmdDevice{
Expand Down Expand Up @@ -181,6 +190,32 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
State: new(ctlpb.ResponseState),
},
},
"scan over drpc; only ctrlrs with valid states shown": {
req: ctlpb.ScanNvmeReq{},
smdRes: &ctlpb.SmdDevResp{
Devices: proto.SmdDevices{
withDevState(proto.MockSmdDevice(storage.MockNvmeController(1), 1),
ctlpb.NvmeDevState_UNPLUGGED),
withDevState(proto.MockSmdDevice(storage.MockNvmeController(2), 2),
ctlpb.NvmeDevState_UNKNOWN),
withDevState(proto.MockSmdDevice(storage.MockNvmeController(3), 3),
ctlpb.NvmeDevState_NORMAL),
withDevState(proto.MockSmdDevice(storage.MockNvmeController(4), 4),
ctlpb.NvmeDevState_NEW),
withDevState(proto.MockSmdDevice(storage.MockNvmeController(5), 5),
ctlpb.NvmeDevState_EVICTED),
},
},
healthRes: healthRespWithUsage(),
expResp: &ctlpb.ScanNvmeResp{
Ctrlrs: proto.NvmeControllers{
withState(proto.MockNvmeController(3), ctlpb.NvmeDevState_NORMAL),
withState(proto.MockNvmeController(4), ctlpb.NvmeDevState_NEW),
withState(proto.MockNvmeController(5), ctlpb.NvmeDevState_EVICTED),
},
State: new(ctlpb.ResponseState),
},
},
"scan over drpc; with smd and health; missing ctrlr in smd": {
req: ctlpb.ScanNvmeReq{Meta: true, Health: true},
smdRes: func() *ctlpb.SmdDevResp {
Expand Down
76 changes: 35 additions & 41 deletions src/mgmt/srv_query.c
Original file line number Diff line number Diff line change
Expand Up @@ -338,11 +338,8 @@ ctrlr_reset_str_fields(Ctl__NvmeController *ctrlr)
static int
add_ctrlr_details(Ctl__NvmeController *ctrlr, struct bio_dev_info *dev_info)
{
int rc;
int rc = 0;

rc = copy_str2ctrlr(&ctrlr->pci_addr, dev_info->bdi_traddr);
if (rc != 0)
return rc;
rc = copy_str2ctrlr(&ctrlr->model, dev_info->bdi_ctrlr->model);
if (rc != 0)
return rc;
Expand All @@ -364,6 +361,32 @@ add_ctrlr_details(Ctl__NvmeController *ctrlr, struct bio_dev_info *dev_info)
ctrlr->model, ctrlr->serial, ctrlr->fw_rev, ctrlr->vendor_id, ctrlr->pci_dev_type,
ctrlr->socket_id);

/* Populate NVMe namespace id and capacity */

if (dev_info->bdi_ctrlr->nss == NULL) {
D_ERROR("nss not initialized in bio_dev_info");
return -DER_INVAL;
}
D_ASSERT(dev_info->bdi_ctrlr->nss->next == NULL);

/* When describing a SMD, only one NVMe namespace is relevant */
D_ALLOC_ARRAY(ctrlr->namespaces, 1);
if (ctrlr->namespaces == NULL) {
return -DER_NOMEM;
}
D_ALLOC_PTR(ctrlr->namespaces[0]);
if (ctrlr->namespaces[0] == NULL) {
return -DER_NOMEM;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[defect] ctrlr->namespaces leaked?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, thanks for spotting, fixed in the relevant free function

}
ctrlr->n_namespaces = 1;
ctl__nvme_controller__namespace__init(ctrlr->namespaces[0]);

ctrlr->namespaces[0]->id = dev_info->bdi_ctrlr->nss->id;
ctrlr->namespaces[0]->size = dev_info->bdi_ctrlr->nss->size;

D_DEBUG(DB_MGMT, "ns id/size: '%d' '%ld'\n", ctrlr->namespaces[0]->id,
ctrlr->namespaces[0]->size);

return 0;
}

Expand Down Expand Up @@ -426,12 +449,6 @@ ds_mgmt_smd_list_devs(Ctl__SmdDevResp *resp)
for (j = 0; j < dev_info->bdi_tgt_cnt; j++)
resp->devices[i]->tgt_ids[j] = dev_info->bdi_tgts[j];

if (dev_info->bdi_ctrlr == NULL) {
D_ERROR("ctrlr not initialized in bio_dev_info");
rc = -DER_INVAL;
break;
}

/* Populate NVMe controller details */

D_ALLOC_PTR(resp->devices[i]->ctrlr);
Expand All @@ -443,48 +460,25 @@ ds_mgmt_smd_list_devs(Ctl__SmdDevResp *resp)
/* Set string fields to NULL to allow D_FREE to work as expected on cleanup */
ctrlr_reset_str_fields(resp->devices[i]->ctrlr);

rc = add_ctrlr_details(resp->devices[i]->ctrlr, dev_info);
rc = copy_str2ctrlr(&resp->devices[i]->ctrlr->pci_addr, dev_info->bdi_traddr);
if (rc != 0)
break;

/* Populate NVMe namespace id and capacity */

if (dev_info->bdi_ctrlr->nss == NULL) {
D_ERROR("nss not initialized in bio_dev_info");
rc = -DER_INVAL;
break;
}
D_ASSERT(dev_info->bdi_ctrlr->nss->next == NULL);

/* When describing a SMD, only one NVMe namespace is relevant */
D_ALLOC_ARRAY(resp->devices[i]->ctrlr->namespaces, 1);
if (resp->devices[i]->ctrlr->namespaces == NULL) {
rc = -DER_NOMEM;
break;
}
D_ALLOC_PTR(resp->devices[i]->ctrlr->namespaces[0]);
if (resp->devices[i]->ctrlr->namespaces[0] == NULL) {
rc = -DER_NOMEM;
break;
if (dev_info->bdi_ctrlr != NULL) {
rc = add_ctrlr_details(resp->devices[i]->ctrlr, dev_info);
if (rc != 0)
break;
resp->devices[i]->ctrlr_namespace_id = dev_info->bdi_ctrlr->nss->id;
} else {
D_DEBUG(DB_MGMT, "ctrlr not initialized in bio_dev_info, unplugged?");
}
resp->devices[i]->ctrlr->n_namespaces = 1;
ctl__nvme_controller__namespace__init(resp->devices[i]->ctrlr->namespaces[0]);

resp->devices[i]->ctrlr->namespaces[0]->id = dev_info->bdi_ctrlr->nss->id;
resp->devices[i]->ctrlr->namespaces[0]->size = dev_info->bdi_ctrlr->nss->size;
resp->devices[i]->ctrlr_namespace_id = dev_info->bdi_ctrlr->nss->id;

D_DEBUG(DB_MGMT, "ns id/size: '%d' '%ld'\n",
resp->devices[i]->ctrlr->namespaces[0]->id,
resp->devices[i]->ctrlr->namespaces[0]->size);

/* Populate NVMe device state */

if ((dev_info->bdi_flags & NVME_DEV_FL_PLUGGED) == 0) {
resp->devices[i]->ctrlr->dev_state = CTL__NVME_DEV_STATE__UNPLUGGED;
goto next_dev;
}

if ((dev_info->bdi_flags & NVME_DEV_FL_FAULTY) != 0)
resp->devices[i]->ctrlr->dev_state = CTL__NVME_DEV_STATE__EVICTED;
else if ((dev_info->bdi_flags & NVME_DEV_FL_INUSE) == 0)
Expand Down