Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use lsblk to safeguard against outdated symlinks #1878

Merged
merged 1 commit into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion pkg/driver/node_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ package driver
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"

Expand All @@ -47,6 +49,7 @@ func (d *nodeService) appendPartition(devicePath, partition string) string {
// if the device is not nvme, return the path directly
// if the device is nvme, finds and returns the nvme device path eg. /dev/nvme1n1
func (d *nodeService) findDevicePath(devicePath, volumeID, partition string) (string, error) {
strippedVolumeName := strings.Replace(volumeID, "-", "", -1)
canonicalDevicePath := ""

// If the given path exists, the device MAY be nvme. Further, it MAY be a
Expand Down Expand Up @@ -76,6 +79,9 @@ func (d *nodeService) findDevicePath(devicePath, volumeID, partition string) (st
}

klog.V(5).InfoS("[Debug] The canonical device path was resolved", "devicePath", devicePath, "cacanonicalDevicePath", canonicalDevicePath)
if err = verifyVolumeSerialMatch(canonicalDevicePath, strippedVolumeName, execRunner); err != nil {
return "", err
}
return d.appendPartition(canonicalDevicePath, partition), nil
}

Expand All @@ -87,13 +93,16 @@ func (d *nodeService) findDevicePath(devicePath, volumeID, partition string) (st
// which AWS presents NVME devices under /dev/disk/by-id/. For example,
// vol-0fab1d5e3f72a5e23 creates a symlink at
// /dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_vol0fab1d5e3f72a5e23
nvmeName := "nvme-Amazon_Elastic_Block_Store_" + strings.Replace(volumeID, "-", "", -1)
nvmeName := "nvme-Amazon_Elastic_Block_Store_" + strippedVolumeName

nvmeDevicePath, err := findNvmeVolume(d.deviceIdentifier, nvmeName)

if err == nil {
klog.V(5).InfoS("[Debug] successfully resolved", "nvmeName", nvmeName, "nvmeDevicePath", nvmeDevicePath)
canonicalDevicePath = nvmeDevicePath
if err = verifyVolumeSerialMatch(canonicalDevicePath, strippedVolumeName, execRunner); err != nil {
return "", err
}
return d.appendPartition(canonicalDevicePath, partition), nil
} else {
klog.V(5).InfoS("[Debug] error searching for nvme path", "nvmeName", nvmeName, "err", err)
Expand All @@ -116,6 +125,37 @@ func (d *nodeService) findDevicePath(devicePath, volumeID, partition string) (st
return canonicalDevicePath, nil
}

// Helper to inject exec.Comamnd().CombinedOutput() for verifyVolumeSerialMatch
// Tests use a mocked version that does not actually execute any binaries
func execRunner(name string, arg ...string) ([]byte, error) {
return exec.Command(name, arg...).CombinedOutput()
}

func verifyVolumeSerialMatch(canonicalDevicePath string, strippedVolumeName string, execRunner func(string, ...string) ([]byte, error)) error {
// In some rare cases, a race condition can lead to the /dev/disk/by-id/ symlink becoming out of date
// See https://github.com/kubernetes-sigs/aws-ebs-csi-driver/issues/1224 for more info
// Attempt to use lsblk to double check that the nvme device selected was the correct volume
output, err := execRunner("lsblk", "--noheadings", "--ascii", "--nodeps", "--output", "SERIAL", canonicalDevicePath)

if err == nil {
// Look for an EBS volume ID in the output, compare all matches against what we expect
// (in some rare cases there may be multiple matches due to lsblk printing partitions)
// If no volume ID is in the output (non-Nitro instances, SBE devices, etc) silently proceed
volumeRegex := regexp.MustCompile(`vol[a-z0-9]+`)
for _, volume := range volumeRegex.FindAllString(string(output), -1) {
klog.V(6).InfoS("Comparing volume serial", "canonicalDevicePath", canonicalDevicePath, "expected", strippedVolumeName, "actual", volume)
if volume != strippedVolumeName {
return fmt.Errorf("Refusing to mount %s because it claims to be %s but should be %s", canonicalDevicePath, volume, strippedVolumeName)
}
}
} else {
// If the command fails (for example, because lsblk is not available), silently ignore the error and proceed
klog.V(5).ErrorS(err, "Ignoring lsblk failure", "canonicalDevicePath", canonicalDevicePath, "strippedVolumeName", strippedVolumeName)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

np: How about increasing the severity? This can help customers (and us) debug this issue faster.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I debated with myself about this, so I'd like to hear other's opinions.

On one hand, a broken lsblk is an arguably incorrect installation environment.

On the other hand, it feels unhelpful to flood the logs with the same error for every staged volume if lsblk is missing from the container or similar.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thought about this quite a bit an ultimately landed on the following as well:

it feels unhelpful to flood the logs with the same error for every staged volume if lsblk is missing from the container or similar.

}

return nil
}

func errNoDevicePathFound(devicePath, volumeID string) error {
return fmt.Errorf("no device path for device %q volume %q found", devicePath, volumeID)
}
Expand Down
64 changes: 64 additions & 0 deletions pkg/driver/node_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ limitations under the License.
package driver

import (
"fmt"
"io/fs"
"os"
"testing"
Expand Down Expand Up @@ -190,6 +191,69 @@ func TestFindDevicePath(t *testing.T) {
}
}

const fakeVolumeName = "vol11111111111111111"
const fakeIncorrectVolumeName = "vol21111111111111111"

func TestVerifyVolumeSerialMatch(t *testing.T) {
type testCase struct {
name string
execOutput string
execError error
expectError bool
}
testCases := []testCase{
{
name: "success: empty",
execOutput: "",
},
{
name: "success: single",
execOutput: fakeVolumeName,
},
{
name: "success: multiple",
execOutput: fakeVolumeName + "\n" + fakeVolumeName + "\n" + fakeVolumeName,
},
{
name: "success: whitespace",
execOutput: "\t " + fakeVolumeName + " \n \t ",
},
{
name: "success: extra output",
execOutput: "extra output without name in it\n" + fakeVolumeName,
},
{
name: "success: failed command",
execError: fmt.Errorf("Exec failed"),
},
{
name: "failure: wrong volume",
execOutput: fakeIncorrectVolumeName,
expectError: true,
},
{
name: "failure: mixed",
execOutput: fakeVolumeName + "\n" + fakeIncorrectVolumeName,
expectError: true,
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
mockExecRunner := func(_ string, _ ...string) ([]byte, error) {
return []byte(tc.execOutput), tc.execError
}

result := verifyVolumeSerialMatch("path", fakeVolumeName, mockExecRunner)
if tc.expectError {
assert.Error(t, result)
} else {
assert.NoError(t, result)
}
})
}
}

type fakeFileInfo struct {
name string
mode os.FileMode
Expand Down
Loading