Skip to content

Commit

Permalink
feature: block-device mounts (microsoft#2168)
Browse files Browse the repository at this point in the history
This PR adds capability to mount virtual and passthrough disks
as block devices inside containers.

We add a new "blockdev://" prefix to OCI `Mount.ContainerPath`,
which indicates that the source should be mounted as a block
device.

A new `BlockDev` field has been added to `mountConfig` used by
`mountManager`, which indicates that the SCSI attachment should
be mounted as a block device.

The GCS has also been updated to handle `BlockDev`. Instead of
mounting the filesystem, GCS creates a symlink to the block device
corresponding to the SCSI attachment. The symlink path is set
by the shim as the source of a bind mount in the OCI container spec. GCS
resolves the symlink and adds the corresponding device cgroup.
Without the cgroup, the container won't be able to work with the
block device.

We chose a symlink approach instead of bind mounting the device
directly, because the shim doesn't know the path at which the
device will appear inside the UVM. For a direct bind mount to work,
we would need either to encode the SCSI controller/LUN in the OCI
mount's HostPath or to update the communication protocol between the
shim and GCS, so that GCS either returns the device path or lets
the shim query for it.

Below are some CRI container config examples for physical and
virtual disks:

Passthrough physical disk:
```json
{
    ...
    "mounts": [
        {
            "host_path": "\\\\.\\PHYSICALDRIVE1",
            "container_path": "blockdev:///my/block/mount",
            "readonly": false
        }
    ]
    ...
}
```

Virtual VHD disk:
```json
{
    ...
    "mounts": [
        {
            "host_path": "C:\\path\\to\\my\\disk.vhdx",
            "container_path": "blockdev:///my/block/mount",
            "readonly": false
        }
    ]
    ...
}
```

The mount manager will differentiate between a block device and a
filesystem mount. Two containers can use the same managed disk
inside the UVM as a block device or filesystem at the same time.
For a block device mount, a symlink will be created; for a filesystem
mount, the block device will be mounted in the UVM.
```
bash-5.0# ls -l /run/mounts/scsi/
total 16
drwxr-xr-x    3 root     root          4096 Jan  1  1970 m0
drwxr-xr-x    4 root     root          4096 Jun 20 23:20 m1
drwxr-xr-x   18 root     root          4096 Jan  1  1970 m2
drwxr-xr-x    3 root     root          4096 Jun 20 23:20 m3
lrwxrwxrwx    1 root     root             8 Jun 20 23:22 m4 -> /dev/sde
bash-5.0# mount | grep sde
/dev/sde on /run/mounts/scsi/m3 type ext4 (rw,relatime)
```

Signed-off-by: Maksim An <[email protected]>
  • Loading branch information
anmaxvl authored Jun 26, 2024
1 parent e96bfcd commit 53f2486
Show file tree
Hide file tree
Showing 9 changed files with 98 additions and 5 deletions.
2 changes: 2 additions & 0 deletions internal/guest/runtime/hcsv2/uvm.go
Original file line number Diff line number Diff line change
Expand Up @@ -1005,6 +1005,7 @@ func modifyMappedVirtualDisk(
VerityInfo: verityInfo,
EnsureFilesystem: mvd.EnsureFilesystem,
Filesystem: mvd.Filesystem,
BlockDev: mvd.BlockDev,
}
return scsi.Mount(mountCtx, mvd.Controller, mvd.Lun, mvd.Partition, mvd.MountPath,
mvd.ReadOnly, mvd.Options, config)
Expand All @@ -1022,6 +1023,7 @@ func modifyMappedVirtualDisk(
VerityInfo: verityInfo,
EnsureFilesystem: mvd.EnsureFilesystem,
Filesystem: mvd.Filesystem,
BlockDev: mvd.BlockDev,
}
if err := scsi.Unmount(ctx, mvd.Controller, mvd.Lun, mvd.Partition,
mvd.MountPath, config); err != nil {
Expand Down
45 changes: 45 additions & 0 deletions internal/guest/runtime/hcsv2/workload_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ package hcsv2

import (
"context"
"fmt"
"os"
"path/filepath"
"strings"

"github.com/opencontainers/runc/libcontainer/devices"
oci "github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"go.opencensus.io/trace"
Expand Down Expand Up @@ -84,6 +86,45 @@ func updateHugePageMounts(sbid string, spec *oci.Spec) error {
return nil
}

// updateBlockDeviceMounts processes every mount in spec whose destination
// starts with guestpath.BlockDevMountPrefix, i.e. mounts that should expose a
// block device to the container rather than a filesystem.
//
// For each such mount it:
//   - resolves the mount source, which is expected to be a symlink, to the
//     path it points at;
//   - appends an "allow" device cgroup rule for that device to
//     spec.Linux.Resources.Devices, with access "r" when the mount carries the
//     "ro" option and "rwm" otherwise;
//   - strips the prefix from the mount destination in place.
//
// spec.Linux and spec.Linux.Resources are optional (pointer) sections of the
// OCI spec; they are allocated on demand so that a spec lacking them does not
// cause a nil pointer dereference. Specs without block-device mounts are left
// untouched.
//
// An error is returned if the source cannot be read as a symlink or if device
// information cannot be obtained for the resolved path.
func updateBlockDeviceMounts(spec *oci.Spec) error {
	for i, m := range spec.Mounts {
		if !strings.HasPrefix(m.Destination, guestpath.BlockDevMountPrefix) {
			continue
		}

		// Read-only mounts only get read access to the device; everything
		// else gets read, write and mknod.
		permissions := "rwm"
		for _, o := range m.Options {
			if o == "ro" {
				permissions = "r"
				break
			}
		}

		// For block device mounts, the source will be a symlink. Resolve it first
		// before passing to `DeviceFromPath`, which expects a real device path.
		rPath, err := os.Readlink(m.Source)
		if err != nil {
			return fmt.Errorf("failed to readlink %s: %w", m.Source, err)
		}

		sourceDevice, err := devices.DeviceFromPath(rPath, permissions)
		if err != nil {
			return fmt.Errorf("failed to get device from path: %w", err)
		}

		deviceCgroup := oci.LinuxDeviceCgroup{
			Allow:  true,
			Type:   string(sourceDevice.Type),
			Major:  &sourceDevice.Major,
			Minor:  &sourceDevice.Minor,
			Access: string(sourceDevice.Permissions),
		}

		// Both sections are optional in the OCI spec; make sure they exist
		// before appending the cgroup rule.
		if spec.Linux == nil {
			spec.Linux = &oci.Linux{}
		}
		if spec.Linux.Resources == nil {
			spec.Linux.Resources = &oci.LinuxResources{}
		}

		spec.Linux.Resources.Devices = append(spec.Linux.Resources.Devices, deviceCgroup)
		spec.Mounts[i].Destination = strings.TrimPrefix(m.Destination, guestpath.BlockDevMountPrefix)
	}
	return nil
}

func specHasGPUDevice(spec *oci.Spec) bool {
for _, d := range spec.Windows.Devices {
if d.IDType == "gpu" {
Expand Down Expand Up @@ -115,6 +156,10 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.
return errors.Wrapf(err, "failed to update hugepages mounts for container %v in sandbox %v", id, sbid)
}

if err = updateBlockDeviceMounts(spec); err != nil {
return fmt.Errorf("failed to update block device mounts for container %v in sandbox %v: %w", id, sbid, err)
}

// Add default mounts for container networking (e.g. /etc/hostname, /etc/hosts),
// if spec didn't override them explicitly.
networkingMounts := specInternal.GenerateWorkloadContainerNetworkMounts(sbid, spec)
Expand Down
25 changes: 25 additions & 0 deletions internal/guest/storage/scsi/scsi.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"time"

"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"go.opencensus.io/trace"
"golang.org/x/sys/unix"

Expand All @@ -34,6 +35,7 @@ import (
var (
osMkdirAll = os.MkdirAll
osRemoveAll = os.RemoveAll
osSymlink = os.Symlink
unixMount = unix.Mount

// mock functions for testing getDevicePath
Expand Down Expand Up @@ -112,6 +114,7 @@ type Config struct {
VerityInfo *guestresource.DeviceVerityInfo
EnsureFilesystem bool
Filesystem string
BlockDev bool
}

// Mount creates a mount from the SCSI device on `controller` index `lun` to
Expand Down Expand Up @@ -163,6 +166,19 @@ func Mount(
}
}

// create and symlink block device mount target
if config.BlockDev {
parent := filepath.Dir(target)
if err := osMkdirAll(parent, 0700); err != nil {
return err
}
log.G(ctx).WithFields(logrus.Fields{
"source": source,
"target": target,
}).Trace("creating block device symlink")
return osSymlink(source, target)
}

if err := osMkdirAll(target, 0700); err != nil {
return err
}
Expand Down Expand Up @@ -280,6 +296,15 @@ func Unmount(
trace.Int64Attribute("partition", int64(partition)),
trace.StringAttribute("target", target))

// skip unmount logic for block devices, since they are just symlinks
if config.BlockDev {
log.G(ctx).WithField("target", target).Trace("removing block device symlink")
if err := osRemoveAll(target); err != nil {
return fmt.Errorf("failed to remove symlink: %w", err)
}
return nil
}

// unmount target
if err := storageUnmountPath(ctx, target, true); err != nil {
return errors.Wrapf(err, "unmount failed: %s", target)
Expand Down
3 changes: 3 additions & 0 deletions internal/guestpath/paths.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ const (
// HugePagesMountPrefix is mount prefix used in container spec to mark a
// huge-pages mount
HugePagesMountPrefix = "hugepages://"
// BlockDevMountPrefix is mount prefix used in container spec to mark a
// block-device mount.
BlockDevMountPrefix = "blockdev://"
// PipePrefix is the mount prefix used in container spec to mark a named pipe
PipePrefix = `\\.\pipe`
// LCOWMountPathPrefixFmt is the path format in the LCOW UVM where
Expand Down
19 changes: 14 additions & 5 deletions internal/hcsoci/resources_lcow.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,22 +83,27 @@ func allocateLinuxResources(ctx context.Context, coi *createOptionsInternal, r *
}

l := log.G(ctx).WithField("mount", fmt.Sprintf("%+v", mount))

isBlockDev := strings.HasPrefix(mount.Destination, guestpath.BlockDevMountPrefix)
if mount.Type == MountTypePhysicalDisk {
l.Debug("hcsshim::allocateLinuxResources Hot-adding SCSI physical disk for OCI mount")
scsiMount, err := coi.HostingSystem.SCSIManager.AddPhysicalDisk(
ctx,
hostPath,
readOnly,
coi.HostingSystem.ID(),
&scsi.MountConfig{Options: mount.Options},
&scsi.MountConfig{Options: mount.Options, BlockDev: isBlockDev},
)
if err != nil {
return errors.Wrapf(err, "adding SCSI physical disk mount %+v", mount)
}

uvmPathForFile = scsiMount.GuestPath()
r.Add(scsiMount)
coi.Spec.Mounts[i].Type = "none"
mt := "none"
if isBlockDev {
mt = "bind"
}
coi.Spec.Mounts[i].Type = mt
} else if mount.Type == MountTypeVirtualDisk {
l.Debug("hcsshim::allocateLinuxResources Hot-adding SCSI virtual disk for OCI mount")

Expand All @@ -109,15 +114,19 @@ func allocateLinuxResources(ctx context.Context, coi *createOptionsInternal, r *
hostPath,
readOnly,
coi.HostingSystem.ID(),
&scsi.MountConfig{Options: mount.Options},
&scsi.MountConfig{Options: mount.Options, BlockDev: isBlockDev},
)
if err != nil {
return errors.Wrapf(err, "adding SCSI virtual disk mount %+v", mount)
}

uvmPathForFile = scsiMount.GuestPath()
r.Add(scsiMount)
coi.Spec.Mounts[i].Type = "none"
mt := "none"
if isBlockDev {
mt = "bind"
}
coi.Spec.Mounts[i].Type = mt
} else if strings.HasPrefix(mount.Source, guestpath.SandboxMountPrefix) {
// Mounts that map to a path in UVM are specified with 'sandbox://' prefix.
// example: sandbox:///a/dirInUvm destination:/b/dirInContainer
Expand Down
1 change: 1 addition & 0 deletions internal/protocol/guestresource/resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ type LCOWMappedVirtualDisk struct {
ReadOnly bool `json:"ReadOnly,omitempty"`
Encrypted bool `json:"Encrypted,omitempty"`
Options []string `json:"Options,omitempty"`
BlockDev bool `json:"BlockDev,omitempty"`
// Deprecated: verity info is read by the guest
VerityInfo *DeviceVerityInfo `json:"VerityInfo,omitempty"`
EnsureFilesystem bool `json:"EnsureFilesystem,omitempty"`
Expand Down
2 changes: 2 additions & 0 deletions internal/uvm/scsi/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ func mountRequest(controller, lun uint, path string, config *mountConfig, osType
Options: config.options,
EnsureFilesystem: config.ensureFilesystem,
Filesystem: config.filesystem,
BlockDev: config.blockDev,
}
default:
return guestrequest.ModificationRequest{}, fmt.Errorf("unsupported os type: %s", osType)
Expand All @@ -221,6 +222,7 @@ func unmountRequest(controller, lun uint, path string, config *mountConfig, osTy
Lun: uint8(lun),
Partition: config.partition,
Controller: uint8(controller),
BlockDev: config.blockDev,
}
default:
return guestrequest.ModificationRequest{}, fmt.Errorf("unsupported os type: %s", osType)
Expand Down
5 changes: 5 additions & 0 deletions internal/uvm/scsi/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ type MountConfig struct {
// mounted as.
// This is only supported for LCOW.
Filesystem string
// BlockDev indicates if the device should be mounted as a block device.
// This is only supported for LCOW.
BlockDev bool
}

// Mount represents a SCSI device that has been attached to a VM, and potentially
Expand Down Expand Up @@ -157,6 +160,7 @@ func (m *Manager) AddVirtualDisk(
options: mc.Options,
ensureFilesystem: mc.EnsureFilesystem,
filesystem: mc.Filesystem,
blockDev: mc.BlockDev,
}
}
return m.add(ctx,
Expand Down Expand Up @@ -202,6 +206,7 @@ func (m *Manager) AddPhysicalDisk(
options: mc.Options,
ensureFilesystem: mc.EnsureFilesystem,
filesystem: mc.Filesystem,
blockDev: mc.BlockDev,
}
}
return m.add(ctx,
Expand Down
1 change: 1 addition & 0 deletions internal/uvm/scsi/mount.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ type mountConfig struct {
partition uint64
readOnly bool
encrypted bool
blockDev bool
options []string
ensureFilesystem bool
filesystem string
Expand Down

0 comments on commit 53f2486

Please sign in to comment.