Skip to content

Commit

Permalink
Nvme resiliency read from sysfs instead of nvme commands
Browse files Browse the repository at this point in the history
Signed-off-by: kj-netapp <[email protected]>
  • Loading branch information
kj-netapp authored Jan 10, 2025
1 parent ffa2375 commit a9f98f3
Show file tree
Hide file tree
Showing 10 changed files with 485 additions and 74 deletions.
4 changes: 2 additions & 2 deletions frontend/csi/node_server.go
Original file line number Diff line number Diff line change
Expand Up @@ -2517,9 +2517,10 @@ func (p *Plugin) nodeUnstageNVMeVolume(
disconnect := p.nvmeHandler.RemovePublishedNVMeSession(&publishedNVMeSessions, publishInfo.NVMeSubsystemNQN,
publishInfo.NVMeNamespaceUUID)

nvmeSubsys := p.nvmeHandler.NewNVMeSubsystem(ctx, publishInfo.NVMeSubsystemNQN)
// Get the device using 'nvme-cli' commands. Flush the device IOs.
// Proceed further with unstage flow, if device is not found.
nvmeDev, err := p.nvmeHandler.NewNVMeDevice(ctx, publishInfo.NVMeNamespaceUUID)
nvmeDev, err := nvmeSubsys.GetNVMeDevice(ctx, publishInfo.NVMeNamespaceUUID)
if err != nil && !errors.IsNotFoundError(err) {
return nil, fmt.Errorf("failed to get NVMe device; %v", err)
}
Expand Down Expand Up @@ -2586,7 +2587,6 @@ func (p *Plugin) nodeUnstageNVMeVolume(
}

// Get the number of namespaces associated with the subsystem
nvmeSubsys := p.nvmeHandler.NewNVMeSubsystem(ctx, publishInfo.NVMeSubsystemNQN)
numNs, err := nvmeSubsys.GetNamespaceCount(ctx)
if err != nil {
Logc(ctx).WithField(
Expand Down
31 changes: 31 additions & 0 deletions mocks/mock_utils/mock_filesystem/mock_filesystem_client.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 31 additions & 0 deletions utils/filesystem/filesystem.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ type Filesystem interface {
ctx context.Context, path string,
) (available, capacity, usage, inodes, inodesFree, inodesUsed int64, err error)
GetUnmountPath(ctx context.Context, trackingInfo *models.VolumeTrackingInfo) (string, error)
ScanFile(filename string) ([]byte, error)
ScanDir(path string) ([]os.FileInfo, error)
}

type Mount interface {
Expand Down Expand Up @@ -340,3 +342,32 @@ func (f *FSClient) DeleteFile(ctx context.Context, filepath, fileDescription str

return filepath, nil
}

func (f *FSClient) ScanFile(filename string) ([]byte, error) {
fs := afero.NewOsFs()

file, err := fs.Open(filename)
if err != nil {
fmt.Println("Failed to open file:", err)
return nil, err
}
defer file.Close()

data, err := afero.ReadAll(file)
if err != nil {
fmt.Println("Failed to read file:", err)
return nil, err
}
return data, nil
}

func (f *FSClient) ScanDir(path string) ([]os.FileInfo, error) {
fs := afero.NewOsFs()

dirEntries, err := afero.ReadDir(fs, path)
if err != nil {
fmt.Println("Failed to read directory:", err)
return nil, err
}
return dirEntries, nil
}
3 changes: 2 additions & 1 deletion utils/filesystem/filesystem_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ func (f *FSClient) getFilesystemSize(ctx context.Context, _ string) (int64, erro
}

func (f *FSClient) GetFilesystemStats(ctx context.Context, path string) (int64, int64, int64, int64, int64, int64,
error) {
error,
) {
Logc(ctx).Debug(">>>> filesystem_windows.GetFilesystemStats")
defer Logc(ctx).Debug("<<<< filesystem_windows.GetFilesystemStats")

Expand Down
90 changes: 29 additions & 61 deletions utils/nvme.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,47 +14,25 @@ import (

. "github.com/netapp/trident/logging"
"github.com/netapp/trident/utils/devices/luks"
"github.com/netapp/trident/utils/errors"
"github.com/netapp/trident/utils/exec"
"github.com/netapp/trident/utils/filesystem"
"github.com/netapp/trident/utils/models"
)

const NVMeAttachTimeout = 20 * time.Second

var fsClient = filesystem.New(mountClient)

// getNVMeSubsystem returns the NVMe subsystem details.
func getNVMeSubsystem(ctx context.Context, subsysNqn string) (*NVMeSubsystem, error) {
Logc(ctx).Debug(">>>> nvme.getNVMeSubsystem")
defer Logc(ctx).Debug("<<<< nvme.getNVMeSubsystem")

subsys, err := GetNVMeSubsystemList(ctx)
if err != nil {
Logc(ctx).WithField("Error", err).Errorf("Failed to get subsystem list: %v", err)
return nil, err
}

// Getting current subsystem details.
for _, sub := range subsys.Subsystems {
if sub.NQN == subsysNqn {
return &sub, nil
}
}

return nil, fmt.Errorf("couldn't find subsystem %s", subsysNqn)
}
var fsClient = *filesystem.New(mountClient)

// updatePaths updates the paths with the current state of the subsystem on the k8s node.
func (s *NVMeSubsystem) updatePaths(ctx context.Context) error {
// Getting current state of subsystem on the k8s node.
sub, err := getNVMeSubsystem(ctx, s.NQN)
paths, err := GetNVMeSubsystemPaths(ctx, fsClient, s.Name)
if err != nil {
Logc(ctx).WithField("Error", err).Errorf("Failed to update subsystem paths: %v", err)
return fmt.Errorf("failed to update subsystem paths: %v", err)
}

s.Paths = sub.Paths
s.Paths = paths

return nil
}
Expand Down Expand Up @@ -137,43 +115,38 @@ func (s *NVMeSubsystem) Disconnect(ctx context.Context) error {

// GetNamespaceCount returns the number of namespaces mapped to the subsystem.
func (s *NVMeSubsystem) GetNamespaceCount(ctx context.Context) (int, error) {
var combinedError error

credibility := false
for _, path := range s.Paths {
count, err := GetNamespaceCountForSubsDevice(ctx, "/dev/"+path.Name)
if err != nil {
Logc(ctx).WithField("Error", err).Warnf("Failed to get namespace count: %v", err)
combinedError = multierr.Append(combinedError, err)
continue
if path.State == "live" {
credibility = true
break
}
}

return count, nil
if !credibility {
return 0, fmt.Errorf("nvme paths are down, couldn't get the number of namespaces")
}

count, err := GetNVMeDeviceCountAt(ctx, s.FS, s.Name)
if err != nil {
Logc(ctx).Errorf("Failed to get namespace count: %v", err)
return 0, err
}

// Couldn't find any sessions, so no namespaces are attached to this subsystem.
// But if there was error getting the number of namespaces from all the paths, return error.
return 0, combinedError
return count, nil
}

// getNVMeDevice returns the NVMe device corresponding to nsPath namespace.
func getNVMeDevice(ctx context.Context, nsUUID string) (*NVMeDevice, error) {
Logc(ctx).Debug(">>>> nvme.getNVMeDevice")
defer Logc(ctx).Debug("<<<< nvme.getNVMeDevice")
func (s *NVMeSubsystem) GetNVMeDevice(ctx context.Context, nsUUID string) (NVMeDeviceInterface, error) {
Logc(ctx).Debug(">>>> nvme.GetNVMeDevice")
defer Logc(ctx).Debug("<<<< nvme.GetNVMeDevice")

dList, err := GetNVMeDeviceList(ctx)
devInterface, err := GetNVMeDeviceAt(ctx, s.Name, nsUUID)
if err != nil {
return nil, fmt.Errorf("failed to get device: %v", err)
}

for _, dev := range dList.Devices {
if dev.UUID == nsUUID {
Logc(ctx).Debugf("Device found: %v.", dev)
return &dev, nil
}
Logc(ctx).Errorf("Failed to get NVMe device, %v", err)
return nil, err
}

Logc(ctx).WithField("nsUUID", nsUUID).Debug("No device found for this Namespace.")
return nil, errors.NotFoundError("no device found for the given namespace %v", nsUUID)
return devInterface, nil
}

// GetPath returns the device path where we mount the filesystem in NodePublish.
Expand Down Expand Up @@ -203,7 +176,7 @@ func (d *NVMeDevice) FlushDevice(ctx context.Context, ignoreErrors, force bool)

// IsNil returns true if Device and NamespacePath are not set.
func (d *NVMeDevice) IsNil() bool {
if d == nil || (d.Device == "" && d.NamespacePath == "") {
if d == nil || d.Device == "" {
return true
}
return false
Expand All @@ -214,20 +187,15 @@ func NewNVMeHandler() NVMeInterface {
return &NVMeHandler{}
}

// NewNVMeDevice returns new NVMe device
func (nh *NVMeHandler) NewNVMeDevice(ctx context.Context, nsUUID string) (NVMeDeviceInterface, error) {
return getNVMeDevice(ctx, nsUUID)
}

// NewNVMeSubsystem returns NVMe subsystem object. Even if a subsystem is not connected to the k8s node,
// this function returns a minimal NVMe subsystem object.
func (nh *NVMeHandler) NewNVMeSubsystem(ctx context.Context, subsNqn string) NVMeSubsystemInterface {
sub, err := getNVMeSubsystem(ctx, subsNqn)
sub, err := GetNVMeSubsystem(ctx, fsClient, subsNqn)
if err != nil {
Logc(ctx).WithField("Error", err).Warnf("Failed to get subsystem: %v; returning minimal subsystem", err)
return &NVMeSubsystem{NQN: subsNqn}
}
return sub
return &sub
}

// GetHostNqn returns the NQN of the k8s node.
Expand Down Expand Up @@ -295,7 +263,7 @@ func AttachNVMeVolume(
}
}

nvmeDev, err := nvmeHandler.NewNVMeDevice(ctx, publishInfo.NVMeNamespaceUUID)
nvmeDev, err := nvmeSubsys.GetNVMeDevice(ctx, publishInfo.NVMeNamespaceUUID)
if err != nil {
return err
}
Expand Down Expand Up @@ -589,7 +557,7 @@ func (nh *NVMeHandler) PopulateCurrentNVMeSessions(ctx context.Context, currSess
}

// Get the list of the subsystems currently present on the k8s node.
subs, err := GetNVMeSubsystemList(ctx)
subs, err := listSubsystemsFromSysFs(fsClient, ctx)
if err != nil {
Logc(ctx).WithField("Error", err).Errorf("Failed to get subsystem list: %v", err)
return err
Expand Down
39 changes: 35 additions & 4 deletions utils/nvme_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (

. "github.com/netapp/trident/logging"
"github.com/netapp/trident/utils/errors"
"github.com/netapp/trident/utils/filesystem"
)

// NVMeActiveOnHost checks if NVMe is active on host
Expand All @@ -24,10 +25,10 @@ func GetHostNqn(ctx context.Context) (string, error) {
}

// GetNVMeSubsystemList returns the list of subsystems connected to the k8s node.
func GetNVMeSubsystemList(ctx context.Context) (Subsystems, error) {
Logc(ctx).Debug(">>>> nvme_darwin.GetNVMeSubsystemList")
defer Logc(ctx).Debug("<<<< nvme_darwin.GetNVMeSubsystemList")
return Subsystems{}, errors.UnsupportedError("GetNVMeSubsystemList is not supported for darwin")
func listSubsystemsFromSysFs(fs filesystem.FSClient, ctx context.Context) (Subsystems, error) {
Logc(ctx).Debug(">>>> nvme_darwin.listSubsystemsFromSysFs")
defer Logc(ctx).Debug("<<<< nvme_darwin.listSubsystemsFromSysFs")
return Subsystems{}, errors.UnsupportedError("listSubsystemsFromSysFs is not supported for darwin")
}

// ConnectSubsystemToHost creates a path (or session) from the ONTAP subsystem to the k8s node using svmDataLIF.
Expand All @@ -44,6 +45,36 @@ func DisconnectSubsystemFromHost(ctx context.Context, subsysNqn string) error {
return errors.UnsupportedError("DisconnectSubsystemFromHost is not supported for darwin")
}

func GetNVMeSubsystem(ctx context.Context, fs filesystem.FSClient, nqn string) (NVMeSubsystem, error) {
Logc(ctx).Debug(">>>> nvme_darwin.GetNVMeSubsystem")
defer Logc(ctx).Debug("<<<< nvme_darwin.GetNVMeSubsystem")
return NVMeSubsystem{}, errors.UnsupportedError("GetNVMeSubsystem is not supported for darwin")
}

func GetNVMeSubsystemPaths(ctx context.Context, fs filesystem.FSClient, subsystemDirPath string) ([]Path, error) {
Logc(ctx).Debug(">>>> nvme_darwin.GetNVMeSubsystemPaths")
defer Logc(ctx).Debug("<<<< nvme_darwin.GetNVMeSubsystemPaths")
return []Path{}, errors.UnsupportedError("GetNVMeSubsystemPaths is not supported for darwin")
}

func InitializeNVMeSubsystemPath(ctx context.Context, path *Path) error {
Logc(ctx).Debug(">>>> nvme_darwin.InitializeNVMeSubsystemPath")
defer Logc(ctx).Debug("<<<< nvme_darwin.InitializeNVMeSubsystemPath")
return errors.UnsupportedError("InitializeNVMeSubsystemPath is not supported for darwin")
}

func GetNVMeDeviceCountAt(ctx context.Context, fs filesystem.FSClient, path string) (int, error) {
Logc(ctx).Debug(">>>> nvme_darwin.GetNVMeDeviceCountAt")
defer Logc(ctx).Debug("<<<< nvme_darwin.GetNVMeDeviceCountAt")
return 0, errors.UnsupportedError("GetNVMeDeviceCountAt is not supported for darwin")
}

func GetNVMeDeviceAt(ctx context.Context, path, nsUUID string) (NVMeDeviceInterface, error) {
Logc(ctx).Debug(">>>> nvme_darwin.GetNVMeDeviceAt")
defer Logc(ctx).Debug("<<<< nvme_darwin.GetNVMeDeviceAt")
return nil, errors.UnsupportedError("GetNVMeDeviceAt is not supported for darwin")
}

// GetNamespaceCountForSubsDevice returns the number of namespaces present in a given subsystem device.
func GetNamespaceCountForSubsDevice(ctx context.Context, subsDevice string) (int, error) {
Logc(ctx).Debug(">>>> nvme_darwin.GetNamespaceCount")
Expand Down
Loading

0 comments on commit a9f98f3

Please sign in to comment.