DAOS-16838 control: Fix dmg storage query usage with emulated NVMe #15545

Merged · 3 commits · Dec 19, 2024
Changes from 1 commit
58 changes: 24 additions & 34 deletions src/control/server/ctl_storage_rpc.go
@@ -362,23 +362,6 @@ func (cs *ControlService) scanScm(ctx context.Context, req *ctlpb.ScanScmReq) (*
return resp, nil
}

// Returns the engine configuration managing the given NVMe controller
func (cs *ControlService) getEngineCfgFromNvmeCtl(nc *ctlpb.NvmeController) (*engine.Config, error) {
pciAddrStr, err := ctrlrToPciStr(nc)
if err != nil {
return nil, err
}

for index := range cs.srvCfg.Engines {
if findBdevTier(pciAddrStr, cs.srvCfg.Engines[index].Storage.Tiers) != nil {
return cs.srvCfg.Engines[index], nil
}
}

return nil, errors.Errorf("unknown PCI device, scanned ctrlr %q not found in cfg",
pciAddrStr)
}

// Returns the engine configuration managing the given SCM name-space
func (cs *ControlService) getEngineCfgFromScmNsp(nsp *ctlpb.ScmNamespace) (*engine.Config, error) {
mountPoint := nsp.GetMount().Path
@@ -531,7 +514,7 @@ func getClusterCount(sizeBytes uint64, tgtCount int, clusterSize uint64) uint64
return clusterCount * uint64(tgtCount)
}
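
For context, only the tail of getClusterCount is visible above; a minimal self-contained sketch of the whole helper, assuming the conventional round-up-to-whole-clusters body (the signature and final line match the hunk), would be:

package main

import "fmt"

// getClusterCount (sketch): round sizeBytes up to whole SPDK clusters, then
// scale by the number of targets using the device. The round-up step is an
// assumption; only the signature and return line are confirmed by the diff.
func getClusterCount(sizeBytes uint64, tgtCount int, clusterSize uint64) uint64 {
	clusterCount := sizeBytes / clusterSize
	if sizeBytes%clusterSize != 0 {
		clusterCount++ // a partial cluster still occupies a whole cluster
	}
	return clusterCount * uint64(tgtCount)
}

func main() {
	// 1 GiB of metadata, 32 MiB clusters, 4 targets -> 128 clusters.
	fmt.Println(getClusterCount(1<<30, 4, 32<<20))
}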

func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdjust deviceToAdjust) (subtrClusterCount uint64) {
func (cs *ControlService) getMetaClusterCount(devToAdjust deviceToAdjust) (subtrClusterCount uint64) {
dev := devToAdjust.ctlr.GetSmdDevices()[devToAdjust.idx]
clusterSize := uint64(dev.GetClusterSize())
// Calculate MD cluster overhead based on the number of targets allocated to the device
@@ -540,15 +523,21 @@ func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdj

if dev.GetRoleBits()&storage.BdevRoleMeta != 0 {
clusterCount := getClusterCount(dev.GetMetaSize(), devTgtCount, clusterSize)
cs.log.Tracef("Removing %d Metadata clusters (cluster size: %d, dev tgts: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ",
clusterCount, clusterSize, devTgtCount, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr())
cs.log.Tracef("Removing %d Metadata clusters (meta_size: %s, cluster size: %s, dev tgts: %d - "+
"%s) from the usable size of the SMD device %s (rank %d, ctlr %q): ", clusterCount,
humanize.IBytes(dev.GetMetaSize()), humanize.IBytes(clusterSize), devTgtCount,
humanize.IBytes(clusterSize*clusterCount), dev.GetUuid(), devToAdjust.rank,
devToAdjust.ctlr.GetPciAddr())
subtrClusterCount += clusterCount
}

if dev.GetRoleBits()&storage.BdevRoleWAL != 0 {
clusterCount := getClusterCount(dev.GetMetaWalSize(), devTgtCount, clusterSize)
cs.log.Tracef("Removing %d Metadata WAL clusters (cluster size: %d, dev tgts: %d) from the usable size of the SMD device %s (rank %d, ctlr %s): ",
clusterCount, clusterSize, devTgtCount, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr())
cs.log.Tracef("Removing %d Metadata WAL clusters (meta_wal_size: %s, cluster size: %s, "+
"dev tgts: %d - %s) from the usable size of the SMD device %s (rank %d, ctlr %q): ",
clusterCount, humanize.IBytes(dev.GetMetaWalSize()), humanize.IBytes(clusterSize),
devTgtCount, humanize.IBytes(clusterSize*clusterCount), dev.GetUuid(), devToAdjust.rank,
devToAdjust.ctlr.GetPciAddr())
subtrClusterCount += clusterCount
}

@@ -558,15 +547,21 @@ func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdj

if dev.GetRoleBits()&storage.BdevRoleMeta != 0 {
clusterCount := getClusterCount(dev.GetRdbSize(), 1, clusterSize)
cs.log.Tracef("Removing %d RDB clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s)",
clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr())
cs.log.Tracef("Removing %d RDB clusters (rdb_size: %s, cluster size: %s - %s) from the usable "+
"size of the SMD device %s (rank %d, ctlr %q): ", clusterCount,
humanize.IBytes(dev.GetRdbSize()), humanize.IBytes(clusterSize),
humanize.IBytes(clusterSize*clusterCount), dev.GetUuid(), devToAdjust.rank,
devToAdjust.ctlr.GetPciAddr())
subtrClusterCount += clusterCount
}

if dev.GetRoleBits()&storage.BdevRoleWAL != 0 {
clusterCount := getClusterCount(dev.GetRdbWalSize(), 1, clusterSize)
cs.log.Tracef("Removing %d RDB WAL clusters (cluster size: %d) from the usable size of the SMD device %s (rank %d, ctlr %s)",
clusterCount, clusterSize, dev.GetUuid(), devToAdjust.rank, devToAdjust.ctlr.GetPciAddr())
cs.log.Tracef("Removing %d RDB WAL clusters (rdb_size: %s, cluster size: %s - %s) from the usable "+
"size of the SMD device %s (rank %d, ctlr %q): ", clusterCount,
humanize.IBytes(dev.GetRdbWalSize()), humanize.IBytes(clusterSize),
humanize.IBytes(clusterSize*clusterCount), dev.GetUuid(), devToAdjust.rank,
devToAdjust.ctlr.GetPciAddr())
subtrClusterCount += clusterCount
}
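
For readers unfamiliar with go-humanize: the reworked trace messages format sizes with humanize.IBytes (github.com/dustin/go-humanize), which produces IEC base-1024 strings. A quick standalone check of the formatting:

package main

import (
	"fmt"

	"github.com/dustin/go-humanize"
)

func main() {
	// IEC (base-1024) units, as emitted by the new trace fields.
	fmt.Println(humanize.IBytes(33554432))   // "32 MiB"
	fmt.Println(humanize.IBytes(1073741824)) // "1.0 GiB"
}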

@@ -578,12 +573,6 @@ func (cs *ControlService) getMetaClusterCount(engineCfg *engine.Config, devToAdj
func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) {
devsStat := make(map[uint32]*deviceSizeStat, 0)
for _, ctlr := range resp.GetCtrlrs() {
engineCfg, err := cs.getEngineCfgFromNvmeCtl(ctlr)
if err != nil {
cs.log.Noticef("Skipping NVME controller %s: %s", ctlr.GetPciAddr(), err.Error())
continue
}

for idx, dev := range ctlr.GetSmdDevices() {
rank := dev.GetRank()
devTgtCount := getSmdTgtCount(cs.log, dev)
@@ -612,7 +601,8 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) {
}

cs.log.Tracef("Initial available size of SMD device %s (rank %d, ctlr %s): %s (%d bytes)",
dev.GetUuid(), rank, ctlr.GetPciAddr(), humanize.IBytes(dev.GetAvailBytes()), dev.GetAvailBytes())
dev.GetUuid(), rank, ctlr.GetPciAddr(), humanize.IBytes(dev.GetAvailBytes()),
dev.GetAvailBytes())

clusterSize := uint64(dev.GetClusterSize())
availBytes := (dev.GetAvailBytes() / clusterSize) * clusterSize
@@ -638,7 +628,7 @@ func (cs *ControlService) adjustNvmeSize(resp *ctlpb.ScanNvmeResp) {
continue
}

subtrClusterCount := cs.getMetaClusterCount(engineCfg, devToAdjust)
subtrClusterCount := cs.getMetaClusterCount(devToAdjust)
if subtrClusterCount >= dataClusterCount {
cs.log.Debugf("No more usable space in SMD device %s (rank %d, ctlr %s)",
dev.GetUuid(), rank, ctlr.GetPciAddr())
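One detail of adjustNvmeSize worth keeping in mind when reading these hunks: the reported available byte count is clamped down to whole clusters before any metadata overhead is subtracted. A standalone illustration with made-up sizes:

package main

import "fmt"

func main() {
	availBytes := uint64(100<<20 + 12345) // raw size reported for the SMD device
	clusterSize := uint64(32 << 20)       // 32 MiB SPDK cluster
	clamped := (availBytes / clusterSize) * clusterSize
	fmt.Printf("%d MiB usable before metadata overhead\n", clamped>>20) // 96 MiB
}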
101 changes: 0 additions & 101 deletions src/control/server/ctl_storage_rpc_test.go
@@ -3974,107 +3974,6 @@ func TestServer_CtlSvc_adjustScmSize(t *testing.T) {
}
}

func TestServer_CtlSvc_getEngineCfgFromNvmeCtl(t *testing.T) {
type dataInput struct {
tierCfgs storage.TierConfigs
nvmeCtlr *ctl.NvmeController
}
type expectedOutput struct {
res bool
msg string
}

newTierCfgs := func(tierCfgsSize int32) storage.TierConfigs {
tierCfgs := make(storage.TierConfigs, tierCfgsSize)
for idx := range tierCfgs {
tierCfgs[idx] = storage.NewTierConfig().
WithStorageClass(storage.ClassNvme.String()).
WithBdevDeviceList(test.MockPCIAddr(int32(idx + 1)))
}

return tierCfgs
}

for name, tc := range map[string]struct {
input dataInput
output expectedOutput
}{
"find NVME Ctlr": {
input: dataInput{
tierCfgs: newTierCfgs(5),
nvmeCtlr: &ctl.NvmeController{
PciAddr: test.MockPCIAddr(3),
},
},
output: expectedOutput{res: true},
},
"not find NVME Ctlr": {
input: dataInput{
tierCfgs: newTierCfgs(5),
nvmeCtlr: &ctl.NvmeController{
PciAddr: test.MockPCIAddr(13),
},
},
output: expectedOutput{
res: false,
msg: "unknown PCI device",
},
},
"find VMD device": {
input: dataInput{
tierCfgs: storage.TierConfigs{
storage.NewTierConfig().
WithStorageClass(storage.ClassNvme.String()).
WithBdevDeviceList("0000:04:06.3"),
},
nvmeCtlr: &ctl.NvmeController{
PciAddr: "040603:02:00.0",
},
},
output: expectedOutput{res: true},
},
"Invalid address": {
input: dataInput{
tierCfgs: storage.TierConfigs{
storage.NewTierConfig().
WithStorageClass(storage.ClassNvme.String()).
WithBdevDeviceList("0000:04:06.3"),
},
nvmeCtlr: &ctl.NvmeController{
PciAddr: "666",
},
},
output: expectedOutput{
res: false,
msg: "Invalid PCI address",
},
},
} {
t.Run(name, func(t *testing.T) {
log, buf := logging.NewTestLogger(t.Name())
defer test.ShowBufferOnFailure(t, buf)

engineCfg := engine.MockConfig().WithStorage(tc.input.tierCfgs...)
serverCfg := config.DefaultServer().WithEngines(engineCfg)
cs := mockControlService(t, log, serverCfg, nil, nil, nil)

ec, err := cs.getEngineCfgFromNvmeCtl(tc.input.nvmeCtlr)

if tc.output.res {
test.AssertEqual(t, engineCfg, ec,
fmt.Sprintf("Invalid engine config: want=%v got=%v", engineCfg, ec))
return
}

test.AssertEqual(t, (*engine.Config)(nil), ec,
fmt.Sprintf("Invalid engine config: wait nil"))
test.AssertTrue(t,
strings.Contains(err.Error(), tc.output.msg),
fmt.Sprintf("Invalid error message: %q not contains %q", err, tc.output.msg))
})
}
}

func TestServer_CtlSvc_getEngineCfgFromScmNsp(t *testing.T) {
type dataInput struct {
tierCfgs storage.TierConfigs
14 changes: 11 additions & 3 deletions src/control/server/instance_storage_rpc.go
@@ -332,6 +332,7 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc
if scanSmdResp == nil {
return nil, errors.New("nil smd scan resp")
}
engine.Tracef("smd scan devices: %+v", scanSmdResp.Devices)

// Re-link SMD devices inside NVMe controller structures and populate scan response.

@@ -340,12 +341,16 @@ func scanEngineBdevsOverDrpc(ctx context.Context, engine Engine, pbReq *ctlpb.Sc
}
seenCtrlrs := make(map[string]*ctlpb.NvmeController)

for _, sd := range scanSmdResp.Devices {
for i, sd := range scanSmdResp.Devices {
if sd.Ctrlr == nil {
return nil, errors.Errorf("smd %q has no ctrlr ref", sd.Uuid)
}

addr := sd.Ctrlr.PciAddr
if addr == "" {
// Mock identifier for emulated NVMe mode where devices have no PCI-address.
addr = fmt.Sprintf("0000:00:0.%d", i)
Review thread on this line:

Contributor (reviewer): My only question is whether this could possibly conflict with a real device, since it is used in the seenCtrlrs map. I wonder if it would be better to use a "name" which could be either a PCI address for the real device, or the file name for emulated NVMe.

Contributor (reviewer): In DPDK, PCI_ANY_ID is defined as 0xffff: https://doc.dpdk.org/api/rte__pci_8h.html#a53aca768a081fcf56089353d805ab77c
That also seems to match the kernel: https://elixir.bootlin.com/linux/v6.12.1/source/include/linux/mod_devicetable.h#L18
Might be better to use that value, as there is zero possibility of conflicting with a real vendor ID that way?

tanabarr (author): So are you suggesting FFFF:00:0.%d as the format string?

Contributor (reviewer): I think so, if the goal is to avoid any conflict with a device that might be found via the topology scan. I can't find an official PCI spec to cite, but from what I found about how Linux and DPDK do things, that value is a "can't happen" value, i.e. there's no way that a valid device could show up with that vendor ID.

tanabarr (author): Aren't we getting confused here between PCI address and PCI ID? https://wiki.debian.org/HowToIdentifyADevice/PCI

tanabarr (author): The first segment of the PCI address is the domain, not the vendor ID, as far as I can see.

tanabarr (author, Dec 18, 2024): I've improved the resilience against collisions by using an 0xffff PCI-domain segment and an 0xf func value (neither would be used in a "real" NVMe device address, as the domain segment is either 0x10000 or 0x0000 in almost all allocated addresses and the BDF func value is 0-7). Reference: https://dottedmag.net/blog/pci-basics/
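
To make the agreed-upon scheme concrete, here is a sketch of a collision-proof mock address builder along the lines the author describes. The helper name and exact field layout are illustrative; the final format string does not appear in this view, which shows changes from commit 1 only:

package main

import "fmt"

// mockPCIAddr builds a placeholder BDF string for emulated NVMe devices that
// have no real PCI address. Domain 0xffff and function 0xf are both outside
// the ranges real hardware uses (BDF function values are 0-7), so the key
// cannot collide with a scanned controller in the seenCtrlrs map.
// Illustrative only; not the merged implementation.
func mockPCIAddr(i int) string {
	return fmt.Sprintf("ffff:%02x:00.f", i)
}

func main() {
	for i := 0; i < 2; i++ {
		fmt.Println(mockPCIAddr(i)) // ffff:00:00.f, ffff:01:00.f
	}
}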

}

if _, exists := seenCtrlrs[addr]; !exists {
c := new(ctlpb.NvmeController)
@@ -460,6 +465,7 @@ func bdevScanEngineAssigned(ctx context.Context, engine Engine, req *ctlpb.ScanN
return scanEngineBdevsOverDrpc(ctx, engine, req)
}

// Accommodate for VMD backing devices and emulated NVMe (AIO).
func getEffCtrlrCount(ctrlrs []*ctlpb.NvmeController) (int, error) {
pas := hardware.MustNewPCIAddressSet()
for _, c := range ctrlrs {
@@ -471,11 +477,13 @@ func getEffCtrlrCount(ctrlrs []*ctlpb.NvmeController) (int, error) {
if npas, err := pas.BackingToVMDAddresses(); err != nil {
return 0, err
} else {
pas = npas
return npas.Len(), nil
}
}

return pas.Len(), nil
// Return inputted number of controllers rather than number of parsed addresses to cater for
// the case of emulated NVMe where there will be no valid PCI address.
return len(ctrlrs), nil
}

// bdevScanEngine calls either in to the private engine storage provider to scan bdevs if engine process
83 changes: 81 additions & 2 deletions src/control/server/instance_storage_rpc_test.go
@@ -44,7 +44,7 @@ func (mp *mockPCIeLinkStatsProvider) PCIeCapsFromConfig(cfgBytes []byte, dev *ha
return nil
}

func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
func TestServer_populateCtrlrHealth(t *testing.T) {
healthWithLinkStats := func(maxSpd, spd float32, maxWdt, wdt uint32) *ctlpb.BioHealthResp {
bhr := proto.MockNvmeHealth()
bhr.LinkMaxSpeed = maxSpd
@@ -459,7 +459,7 @@ func TestIOEngineInstance_populateCtrlrHealth(t *testing.T) {
}
}

func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
func TestServer_bdevScanEngine(t *testing.T) {
c := storage.MockNvmeController(2)
withState := func(ctrlr *ctlpb.NvmeController, state ctlpb.NvmeDevState) *ctlpb.NvmeController {
ctrlr.DevState = state
@@ -793,6 +793,85 @@ func TestIOEngineInstance_bdevScanEngine(t *testing.T) {
// Prove link stat provider gets called without Meta flag.
expErr: errors.New("link stats provider fail"),
},
"scan over drpc; aio file emulated nvme; no pci addresses": {
smdRes: &ctlpb.SmdDevResp{
Devices: []*ctlpb.SmdDevice{
{
Uuid: "b80b4653-af58-47b3-aa3b-8ad12965f440",
TgtIds: []int32{
1024, 1024, 0, 0, 0, 4, 4, 4, 8, 8, 8, 12,
12, 12,
},
RoleBits: 7,
Ctrlr: &ctlpb.NvmeController{
DevState: ctlpb.NvmeDevState_NORMAL,
},
},
{
Uuid: "3c7a8e22-38c0-4e6f-8776-d703d049ae6f",
TgtIds: []int32{
1024, 1024, 1, 1, 1, 5, 5, 5, 9, 9, 9, 13,
13, 13,
},
RoleBits: 7,
Ctrlr: &ctlpb.NvmeController{
DevState: ctlpb.NvmeDevState_NORMAL,
},
},
{
Uuid: "40ba7b18-3a0e-4d68-9e8f-c4fc556eb506",
TgtIds: []int32{
1024, 1024, 2, 2, 2, 6, 6, 6, 10, 10, 10,
14, 14, 14,
},
RoleBits: 7,
Ctrlr: &ctlpb.NvmeController{
DevState: ctlpb.NvmeDevState_NORMAL,
},
},
{
Uuid: "d78c849f-fd9e-4a20-9746-b0335e3618da",
TgtIds: []int32{
1024, 1024, 3, 3, 3, 7, 7, 7, 11, 11, 11,
15, 15, 15,
},
RoleBits: 7,
Ctrlr: &ctlpb.NvmeController{
DevState: ctlpb.NvmeDevState_NORMAL,
},
},
},
},
expResp: &ctlpb.ScanNvmeResp{
Ctrlrs: proto.NvmeControllers{
&ctlpb.NvmeController{
DevState: ctlpb.NvmeDevState_NORMAL,
SmdDevices: []*ctlpb.SmdDevice{
{RoleBits: 7},
},
},
&ctlpb.NvmeController{
DevState: ctlpb.NvmeDevState_NORMAL,
SmdDevices: []*ctlpb.SmdDevice{
{RoleBits: 7},
},
},
&ctlpb.NvmeController{
DevState: ctlpb.NvmeDevState_NORMAL,
SmdDevices: []*ctlpb.SmdDevice{
{RoleBits: 7},
},
},
&ctlpb.NvmeController{
DevState: ctlpb.NvmeDevState_NORMAL,
SmdDevices: []*ctlpb.SmdDevice{
{RoleBits: 7},
},
},
},
State: new(ctlpb.ResponseState),
},
},
} {
t.Run(name, func(t *testing.T) {
log, buf := logging.NewTestLogger(t.Name())
4 changes: 2 additions & 2 deletions src/control/server/storage/config.go
@@ -483,8 +483,8 @@ func (tcs TierConfigs) AssignBdevTierRoles(extMetadataPath string) error {
if scs[0].Class == ClassDcpm {
return errors.New("external metadata path for md-on-ssd invalid with dcpm scm-class")
}
// Skip role assignment and validation if no real NVMe tiers exist.
if !tcs.HaveRealNVMe() {
// Skip role assignment and validation if no bdev tiers exist.
if !tcs.HaveBdevs() {
return nil
}

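Why the one-word change above matters: HaveRealNVMe matches only ClassNvme tiers, so with emulated NVMe (AIO file or kdev bdev classes) the md-on-ssd role assignment was skipped entirely. A self-contained sketch of the assumed difference between the two predicates (types and class names mirror src/control/server/storage but are simplified here, not copied from it):

package main

import "fmt"

type Class string

const (
	ClassNvme Class = "nvme"
	ClassFile Class = "file" // AIO file: emulated NVMe
	ClassKdev Class = "kdev" // AIO kernel block device
)

type TierConfig struct{ Class Class }
type TierConfigs []TierConfig

// HaveRealNVMe (sketch): true only when a tier uses real NVMe hardware.
func (tcs TierConfigs) HaveRealNVMe() bool {
	for _, tc := range tcs {
		if tc.Class == ClassNvme {
			return true
		}
	}
	return false
}

// HaveBdevs (sketch): true for any block-device tier, real or emulated.
func (tcs TierConfigs) HaveBdevs() bool {
	for _, tc := range tcs {
		switch tc.Class {
		case ClassNvme, ClassFile, ClassKdev:
			return true
		}
	}
	return false
}

func main() {
	emulated := TierConfigs{{Class: ClassFile}}
	// Before the fix, role assignment bailed out for emulated configs.
	fmt.Println(emulated.HaveRealNVMe(), emulated.HaveBdevs()) // false true
}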