Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix dynamic vdev expansion (zpool online -e) #822

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/sys/efi_partition.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ struct partition64 {
extern int efi_alloc_and_init(int, uint32_t, struct dk_gpt **);
extern int efi_alloc_and_read(int, struct dk_gpt **);
extern int efi_write(int, struct dk_gpt *);
extern int efi_rescan(int);
extern void efi_free(struct dk_gpt *);
extern int efi_type(int);
extern void efi_err_check(struct dk_gpt *);
Expand Down
63 changes: 29 additions & 34 deletions lib/libefi/rdwr_efi.c
Original file line number Diff line number Diff line change
Expand Up @@ -497,10 +497,9 @@ efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc)
return (error);
}

#if defined(__linux__)
static int
efi_rescan(int fd)
int efi_rescan(int fd)
{
#if defined(__linux__)
int retry = 5;
int error;

Expand All @@ -512,10 +511,10 @@ efi_rescan(int fd)
return (-1);
}
}
#endif

return (0);
}
#endif

static int
check_label(int fd, dk_efi_t *dk_ioc)
Expand Down Expand Up @@ -1028,24 +1027,15 @@ efi_use_whole_disk(int fd)
struct dk_gpt *efi_label;
int rval;
int i;
uint_t phy_last_slice = 0;
diskaddr_t pl_start = 0;
diskaddr_t pl_size;
uint_t resv_index = 0, data_index = 0;
diskaddr_t resv_start = 0, data_start = 0;
diskaddr_t difference;

rval = efi_alloc_and_read(fd, &efi_label);
if (rval < 0) {
return (rval);
}

/* find the last physically non-zero partition */
for (i = 0; i < efi_label->efi_nparts - 2; i ++) {
if (pl_start < efi_label->efi_parts[i].p_start) {
pl_start = efi_label->efi_parts[i].p_start;
phy_last_slice = i;
}
}
pl_size = efi_label->efi_parts[phy_last_slice].p_size;

/*
* If alter_lba is 1, we are using the backup label.
* Since we can locate the backup label by disk capacity,
Expand All @@ -1061,27 +1051,38 @@ efi_use_whole_disk(int fd)
return (VT_ENOSPC);
}

difference = efi_label->efi_last_lba - efi_label->efi_altern_lba;

/*
* If there is space between the last physically non-zero partition
* and the reserved partition, just add the unallocated space to this
* area. Otherwise, the unallocated space is added to the last
* physically non-zero partition.
* Find the last physically non-zero partition.
* This is the reserved partition.
*/
if (pl_start + pl_size - 1 == efi_label->efi_last_u_lba -
EFI_MIN_RESV_SIZE) {
efi_label->efi_parts[phy_last_slice].p_size +=
efi_label->efi_last_lba - efi_label->efi_altern_lba;
for (i = 0; i < efi_label->efi_nparts; i ++) {
if (resv_start < efi_label->efi_parts[i].p_start) {
resv_start = efi_label->efi_parts[i].p_start;
resv_index = i;
}
}

/*
* Find the last physically non-zero partition before that.
* This is the data partition.
*/
for (i = 0; i < resv_index; i ++) {
if (data_start < efi_label->efi_parts[i].p_start) {
data_start = efi_label->efi_parts[i].p_start;
data_index = i;
}
}

/*
* Move the reserved partition. There is currently no data in
* here except fabricated devids (which get generated via
* efi_write()). So there is no need to copy data.
*/
efi_label->efi_parts[efi_label->efi_nparts - 1].p_start +=
efi_label->efi_last_lba - efi_label->efi_altern_lba;
efi_label->efi_last_u_lba += efi_label->efi_last_lba
- efi_label->efi_altern_lba;
efi_label->efi_parts[data_index].p_size += difference;
efi_label->efi_parts[resv_index].p_start += difference;
efi_label->efi_last_u_lba += difference;

rval = efi_write(fd, efi_label);
if (rval < 0) {
Expand Down Expand Up @@ -1302,12 +1303,6 @@ efi_write(int fd, struct dk_gpt *vtoc)
(void) write_pmbr(fd, vtoc);
free(dk_ioc.dki_data);

#if defined(__linux__)
rval = efi_rescan(fd);
if (rval)
return (VT_ERROR);
#endif

return (0);
}

Expand Down
28 changes: 18 additions & 10 deletions lib/libzfs/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -2081,28 +2081,30 @@ zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size)
* the disk to use the new unallocated space.
*/
static int
zpool_relabel_disk(libzfs_handle_t *hdl, const char *path)
zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg)
{
char errbuf[1024];
int fd, error;

if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
"relabel '%s': unable to open device: %d"), path, errno);
return (zfs_error(hdl, EZFS_OPENFAILED, errbuf));
return (zfs_error(hdl, EZFS_OPENFAILED, msg));
}

/*
* It's possible that we might encounter an error if the device
* does not have any unallocated space left. If so, we simply
* ignore that error and continue on.
*
* Also, we don't call efi_rescan() - that would just return EBUSY.
* The module will do it for us in vdev_disk_open().
*/
error = efi_use_whole_disk(fd);
(void) close(fd);
if (error && error != VT_ENOSPC) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot "
"relabel '%s': unable to read disk capacity"), path);
return (zfs_error(hdl, EZFS_NOCAP, errbuf));
return (zfs_error(hdl, EZFS_NOCAP, msg));
}
return (0);
}
Expand Down Expand Up @@ -2141,13 +2143,10 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,

if (flags & ZFS_ONLINE_EXPAND ||
zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
char *pathname = NULL;
uint64_t wholedisk = 0;

(void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk);
verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH,
&pathname) == 0);

/*
* XXX - L2ARC 1.0 devices can't support expansion.
Expand All @@ -2159,8 +2158,17 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
}

if (wholedisk) {
pathname += strlen(DISK_ROOT) + 1;
(void) zpool_relabel_disk(hdl, pathname);
const char *fullpath = path;
char buf[MAXPATHLEN];
if (path[0] != '/') {
if (zfs_resolve_shortname(path, buf, sizeof(buf)))
return (zfs_error(hdl, EZFS_NODEVICE, msg));
fullpath = buf;
}

int result = zpool_relabel_disk(hdl, fullpath, msg);
if (result != 0)
return (result);
}
}

Expand Down Expand Up @@ -3814,7 +3822,7 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name)
vtoc->efi_parts[8].p_size = resv;
vtoc->efi_parts[8].p_tag = V_RESERVED;

if ((rval = efi_write(fd, vtoc)) != 0) {
if ((rval = efi_write(fd, vtoc)) != 0 || (rval = efi_rescan(fd)) != 0) {
/*
* Some block drivers (like pcata) may not support EFI
* GPT labels. Print out a helpful error message dir-
Expand Down
65 changes: 63 additions & 2 deletions module/zfs/vdev_disk.c
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,68 @@ vdev_elevator_switch(vdev_t *v, char *elevator)
return (error);
}

/*
 * Expanding a whole disk vdev involves invoking BLKRRPART on the
 * whole disk device. This poses a problem, because BLKRRPART will
 * return EBUSY if one of the disk's partitions is open. That's why
 * we have to do it here, just before opening the data partition.
 * Unfortunately, BLKRRPART works by dropping all partitions and
 * recreating them, which means that for a short time window, all
 * /dev/sdxN device files disappear (until udev recreates them).
 * This means two things:
 *  - When we open the data partition just after a BLKRRPART, we
 *    can't do it using the normal device file path because of the
 *    obvious race condition with udev. Instead, we use reliable
 *    kernel APIs to get a handle to the new partition device from
 *    the whole disk device.
 *  - Because vdev_disk_open() initially needs to find the device
 *    using its path, multiple vdev_disk_open() invocations in
 *    short succession on the same disk with BLKRRPARTs in the
 *    middle have a high probability of failure (because of the
 *    race condition with udev). A typical situation where this
 *    might happen is when the zpool userspace tool does a
 *    TRYIMPORT immediately followed by an IMPORT. For this
 *    reason, we only invoke BLKRRPART in the module when strictly
 *    necessary (zpool online -e case), and rely on userspace to
 *    do it when possible.
 *
 * Arguments:
 *   path - device path of the data partition to re-open.
 *   mode - spa mode, translated with vdev_bdev_mode() for each open.
 *   vd   - holder passed to every open performed here.
 *
 * Returns the partition's block device, opened with FMODE_EXCL, on
 * success. Returns NULL on any failure (never an ERR_PTR), so the
 * caller can fall back to a regular open by path.
 */
static struct block_device *
vdev_disk_rrpart(const char *path, int mode, vdev_disk_t *vd)
{
	struct block_device *result = NULL, *bdev;
	struct gendisk *disk;
	int error, partno;

	/*
	 * vdev_bdev_open() reports failure with an ERR_PTR (the caller
	 * tests its result with IS_ERR()), which is non-NULL.  It must
	 * be rejected here before bdev->bd_dev is dereferenced.
	 */
	bdev = vdev_bdev_open(path, vdev_bdev_mode(mode), vd);
	if (bdev == NULL || IS_ERR(bdev))
		return (NULL);

	/* Look up the owning disk and partition number, then let go. */
	disk = get_gendisk(bdev->bd_dev, &partno);
	vdev_bdev_close(bdev, vdev_bdev_mode(mode));
	if (disk == NULL)
		return (NULL);

	/*
	 * Re-read the partition table through the whole disk device.
	 * This is best effort: a BLKRRPART failure is not fatal, we
	 * still try to re-open the partition below.
	 */
	bdev = bdget(disk_devt(disk));
	if (bdev != NULL) {
		error = blkdev_get(bdev, vdev_bdev_mode(mode), vd);
		if (error == 0) {
			(void) ioctl_by_bdev(bdev, BLKRRPART, 0);
			vdev_bdev_close(bdev, vdev_bdev_mode(mode));
		}
		/*
		 * On failure blkdev_get() has already put bdev, so it
		 * must not be closed a second time.
		 */
	}

	/* Grab the (possibly recreated) partition straight from the disk. */
	bdev = bdget_disk(disk, partno);
	if (bdev != NULL) {
		error = blkdev_get(bdev,
		    vdev_bdev_mode(mode) | FMODE_EXCL, vd);
		if (error == 0)
			result = bdev;
	}
	put_disk(disk);

	return (result);
}

static int
vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
{
struct block_device *bdev;
struct block_device *bdev = NULL;
vdev_disk_t *vd;
int mode, block_size;

Expand Down Expand Up @@ -190,7 +248,10 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *ashift)
* level vdev validation.
*/
mode = spa_mode(v->vdev_spa);
bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd);
if (v->vdev_wholedisk && v->vdev_expanding)
bdev = vdev_disk_rrpart(v->vdev_path, mode, vd);
if (!bdev)
bdev = vdev_bdev_open(v->vdev_path, vdev_bdev_mode(mode), vd);
if (IS_ERR(bdev)) {
kmem_free(vd, sizeof(vdev_disk_t));
return -PTR_ERR(bdev);
Expand Down