Skip to content

Commit

Permalink
DAOS-14181 control: Remove bdev scan cache
Browse files Browse the repository at this point in the history
Required-githooks: true

Signed-off-by: Tom Nabarro <[email protected]>
  • Loading branch information
tanabarr committed Oct 30, 2023
1 parent 1bd280f commit f2ae965
Show file tree
Hide file tree
Showing 20 changed files with 1,193 additions and 2,455 deletions.
2 changes: 2 additions & 0 deletions src/control/drpc/modules.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,8 @@ const (
MethodPoolUpgrade MgmtMethod = C.DRPC_METHOD_MGMT_POOL_UPGRADE
// MethodLedManage defines a method to manage a VMD device LED state
MethodLedManage MgmtMethod = C.DRPC_METHOD_MGMT_LED_MANAGE
// MethodNvmeDevs is a ModuleMgmt method
MethodNvmeDevs MgmtMethod = C.DRPC_METHOD_MGMT_NVME_LIST_DEVS
)

type srvMethod int32
Expand Down
4 changes: 2 additions & 2 deletions src/control/server/config/faults.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ func FaultConfigDuplicateScmDeviceList(curIdx, seenIdx int) *fault.Fault {
func FaultConfigScmDiffClass(curIdx, seenIdx int) *fault.Fault {
return serverConfigFault(
code.ServerConfigScmDiffClass,
fmt.Sprintf("the SCM class in I/O Engine %d is different from I/O Engine %d",
fmt.Sprintf("the SCM class in engine %d is different from engine %d",
curIdx, seenIdx),
"ensure that each I/O Engine has a single SCM tier with the same class and restart",
)
Expand All @@ -156,7 +156,7 @@ func FaultConfigScmDiffClass(curIdx, seenIdx int) *fault.Fault {
// FaultConfigOverlappingBdevDeviceList builds the server config fault raised when the
// bdev_list of the engine at curIdx shares entries with the bdev_list of the engine at
// seenIdx. Both indices are engine indices and are reported as such in the description.
func FaultConfigOverlappingBdevDeviceList(curIdx, seenIdx int) *fault.Fault {
	return serverConfigFault(
		code.ServerConfigOverlappingBdevDeviceList,
		// A single description is passed; both sides of the overlap are engines.
		fmt.Sprintf("the bdev_list value in engine %d overlaps with entries in engine %d", curIdx, seenIdx),
		"ensure that each I/O Engine has a unique set of bdev_list entries and restart",
	)
}
Expand Down
6 changes: 3 additions & 3 deletions src/control/server/config/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ func (cfg *Server) validateMultiEngineConfig(log logging.Logger) error {
seenValues := make(map[string]int)
seenScmSet := make(map[string]int)
seenBdevSet := make(map[string]int)
seenIdx := 0
seenIdx := -1
seenBdevCount := -1
seenTargetCount := -1
seenHelperStreamCount := -1
Expand Down Expand Up @@ -806,8 +806,8 @@ func (cfg *Server) validateMultiEngineConfig(log logging.Logger) error {
// Log error but don't fail in order to be lenient with unbalanced device
// counts in particular cases e.g. using different capacity SSDs or VMDs
// with different number of backing devices.
err := FaultConfigBdevCountMismatch(idx, bdevCount, seenIdx, seenBdevCount)
log.Noticef(err.Error())
e := FaultConfigBdevCountMismatch(idx, bdevCount, seenIdx, seenBdevCount)
log.Noticef(e.Error())
}
if seenTargetCount != -1 && engine.TargetCount != seenTargetCount {
return FaultConfigTargetCountMismatch(idx, engine.TargetCount, seenIdx,
Expand Down
29 changes: 25 additions & 4 deletions src/control/server/config/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1617,14 +1617,15 @@ func TestServerConfig_validateMultiEngineConfig(t *testing.T) {
WithStorageClass("ram").
WithScmMountPoint("b"),
).
WithPinnedNumaNode(0).
WithPinnedNumaNode(1).
WithTargetCount(8)
}

for name, tc := range map[string]struct {
configA *engine.Config
configB *engine.Config
expErr error
expLog string
}{
"successful validation": {
configA: configA(),
Expand Down Expand Up @@ -1690,15 +1691,15 @@ func TestServerConfig_validateMultiEngineConfig(t *testing.T) {
AppendStorage(
storage.NewTierConfig().
WithStorageClass(storage.ClassNvme.String()).
WithBdevDeviceList(MockPCIAddr(1), MockPCIAddr(1)),
WithBdevDeviceList(MockPCIAddr(1), MockPCIAddr(2)),
),
configB: configB().
AppendStorage(
storage.NewTierConfig().
WithStorageClass(storage.ClassNvme.String()).
WithBdevDeviceList(MockPCIAddr(2), MockPCIAddr(2)),
WithBdevDeviceList(MockPCIAddr(2), MockPCIAddr(1)),
),
expErr: errors.New("valid PCI addresses"),
expErr: errors.New("engine 1 overlaps with entries in engine 0"),
},
"mismatched scm_class": {
configA: configA(),
Expand All @@ -1711,6 +1712,21 @@ func TestServerConfig_validateMultiEngineConfig(t *testing.T) {
),
expErr: FaultConfigScmDiffClass(1, 0),
},
"mismatched nr bdev_list": {
configA: configA().
AppendStorage(
storage.NewTierConfig().
WithStorageClass(storage.ClassNvme.String()).
WithBdevDeviceList(MockPCIAddr(1)),
),
configB: configB().
AppendStorage(
storage.NewTierConfig().
WithStorageClass(storage.ClassNvme.String()).
WithBdevDeviceList(MockPCIAddr(2), MockPCIAddr(3)),
),
expLog: "engine 1 has 2 but engine 0 has 1",
},
} {
t.Run(name, func(t *testing.T) {
log, buf := logging.NewTestLogger(t.Name())
Expand All @@ -1722,6 +1738,11 @@ func TestServerConfig_validateMultiEngineConfig(t *testing.T) {

gotErr := conf.Validate(log)
CmpErr(t, tc.expErr, gotErr)

if tc.expLog != "" {
hasEntry := strings.Contains(buf.String(), tc.expLog)
AssertTrue(t, hasEntry, "expected entries not found in log")
}
})
}
}
Expand Down
93 changes: 0 additions & 93 deletions src/control/server/ctl_storage.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,13 @@
package server

import (
"context"
"fmt"
"path/filepath"
"strings"

"github.com/dustin/go-humanize"
"github.com/pkg/errors"

"github.com/daos-stack/daos/src/control/common"
"github.com/daos-stack/daos/src/control/common/proto/ctl"
"github.com/daos-stack/daos/src/control/logging"
"github.com/daos-stack/daos/src/control/server/engine"
"github.com/daos-stack/daos/src/control/server/storage"
Expand Down Expand Up @@ -156,93 +153,3 @@ func (cs *ControlService) getScmUsage(ssr *storage.ScmScanResponse) (*storage.Sc

return &storage.ScmScanResponse{Namespaces: nss}, nil
}

// scanAssignedBdevs retrieves up-to-date NVMe controller info including
// health statistics and stored server meta-data. If I/O Engines are running
// then query is issued over dRPC as go-spdk bindings cannot be used to access
// controller claimed by another process. Only update info for controllers
// assigned to I/O Engines.
//
// nsps is the list of SCM namespaces used to derive the per-target metadata
// size for each engine; statsReq selects whether controller details should be
// refreshed from running engines. Engines without block devices are skipped.
//
// An error is returned if scanning an engine's bdev tiers fails, if the engine
// config or RDB size cannot be resolved for a matching namespace, or if
// updating the in-use controllers fails.
func (cs *ControlService) scanAssignedBdevs(ctx context.Context, nsps []*ctl.ScmNamespace, statsReq bool) (*storage.BdevScanResponse, error) {
	instances := cs.harness.Instances()
	ctrlrs := new(storage.NvmeControllers)

	for _, ei := range instances {
		if !ei.GetStorage().HasBlockDevices() {
			continue
		}

		tsrs, err := ei.ScanBdevTiers()
		if err != nil {
			return nil, err
		}

		// Build slice of controllers in all tiers.
		tierCtrlrs := make([]storage.NvmeController, 0)
		msg := fmt.Sprintf("NVMe tiers for engine-%d:", ei.Index())
		for _, tsr := range tsrs {
			msg += fmt.Sprintf("\n\tTier-%d: %s", tsr.Tier, tsr.Result.Controllers)
			for _, c := range tsr.Result.Controllers {
				tierCtrlrs = append(tierCtrlrs, *c)
			}
		}
		cs.log.Info(msg)

		// If the engine is not running or we aren't interested in temporal
		// statistics for the bdev devices then continue to next engine.
		if !ei.IsReady() || !statsReq {
			ctrlrs.Update(tierCtrlrs...)
			continue
		}

		cs.log.Debugf("updating stats for %d bdev(s) on instance %d", len(tierCtrlrs),
			ei.Index())

		// DAOS-12750 Compute the maximal size of the metadata to allow the engine to fill
		// the WallMeta field response. The maximal metadata (i.e. VOS index file) size
		// should be equal to the SCM available size divided by the number of targets of the
		// engine.
		var mdSize uint64
		var rdbSize uint64
		for _, nsp := range nsps {
			mp := nsp.GetMount()
			if mp == nil {
				continue
			}
			// Only the namespace mounted for this engine's rank is relevant; a rank
			// lookup failure is treated the same as a non-matching rank.
			if r, err := ei.GetRank(); err != nil || uint32(r) != mp.GetRank() {
				continue
			}

			// NOTE DAOS-14223: This metadata size calculation won't necessarily match
			//                  the meta blob size on SSD if --meta-size is specified in
			//                  pool create command.
			mdSize = mp.GetUsableBytes() / uint64(ei.GetTargetCount())

			engineCfg, err := cs.getEngineCfgFromScmNsp(nsp)
			if err != nil {
				return nil, errors.Wrap(err, "Engine with invalid configuration")
			}
			rdbSize, err = cs.getRdbSize(engineCfg)
			if err != nil {
				return nil, err
			}
			// The first namespace matching the engine rank determines the sizes.
			break
		}

		if mdSize == 0 {
			// Index must be called here; passing the method value itself to %d
			// would not print the instance index.
			cs.log.Noticef("instance %d: no SCM space available for metadata", ei.Index())
		}

		// If engine is running and has claimed the assigned devices for
		// each tier, iterate over scan results for each tier and send query
		// over drpc to update controller details with current health stats
		// and smd info.
		updatedCtrlrs, err := ei.updateInUseBdevs(ctx, tierCtrlrs, mdSize, rdbSize)
		if err != nil {
			return nil, errors.Wrapf(err, "instance %d: update online bdevs", ei.Index())
		}

		ctrlrs.Update(updatedCtrlrs...)
	}

	return &storage.BdevScanResponse{Controllers: *ctrlrs}, nil
}
Loading

0 comments on commit f2ae965

Please sign in to comment.