From 5549a537dda49b1cf280f7fdee5ac7f0cc217438 Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Tue, 13 Aug 2019 20:18:53 -0700 Subject: [PATCH] Fix device expansion when VM is powered off When running on an ESXi based VM, I've found that "zpool online -e" will not expand the zpool, if the disk was expanded in ESXi while the VM was powered off. For example, take the following scenario: 1. VM running on top of VMware ESXi 2. ZFS pool created with a given device "sda" of size 8GB 3. VM powered off 4. Device "sda" size expanded to 16GB 5. VM powered on 6. "zpool online -e" used on device "sda" In this situation, after (2) the zpool will be roughly 8GB in size. After (6), the expectation is the zpool's size will expand to roughly 16GB in size; i.e. expand to the new size of the "sda" device. Unfortunately, I've seen that after (6), the zpool size does not change. What's happening is after (5), the EFI label of the "sda" device will be such that fields "efi_last_u_lba", "efi_last_lba", and "efi_altern_lba" all reflect the new size of the disk; i.e. "33554398", "33554431", and "33554431" respectively. Thus, the check that we perform in "efi_use_whole_disk": if ((efi_label->efi_altern_lba == 1) || (efi_label->efi_altern_lba >= efi_label->efi_last_lba)) { This will return true, and then we return from the function without having expanded the size of the zpool/device. In contrast, if we remove steps (3) and (5) in the sequence above, i.e. the device is expanded while the VM is powered on, things change. In that case, the fields "efi_last_u_lba" and "efi_altern_lba" do not change (i.e. they still reflect the old 8GB device size), but the "efi_last_lba" field does change (i.e. it now reflects the new 16GB device size). Thus, when we evaluate the same conditional in "efi_use_whole_disk", it'll return false, so the zpool is expanded. Taking all of this into account, this PR updates "efi_use_whole_disk" to properly expand the zpool when the underlying disk is expanded while the VM is powered off. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Reviewed-by: Don Brady Signed-off-by: Prakash Surya Closes #9111 --- lib/libefi/rdwr_efi.c | 112 ++++++++++++++++++++++++++++++++---------- 1 file changed, 87 insertions(+), 25 deletions(-) diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c index 1d8f631c837d..93c79277dae5 100644 --- a/lib/libefi/rdwr_efi.c +++ b/lib/libefi/rdwr_efi.c @@ -42,6 +42,7 @@ #include #include #include +#include #include static struct uuid_to_ptag { @@ -1113,7 +1114,9 @@ efi_use_whole_disk(int fd) int i; uint_t resv_index = 0, data_index = 0; diskaddr_t resv_start = 0, data_start = 0; - diskaddr_t difference; + diskaddr_t data_size, limit, difference; + boolean_t sync_needed = B_FALSE; + uint_t nblocks; rval = efi_alloc_and_read(fd, &efi_label); if (rval < 0) { @@ -1122,13 +1125,67 @@ efi_use_whole_disk(int fd) return (rval); } + /* + * Find the last physically non-zero partition. + * This should be the reserved partition. + */ + for (i = 0; i < efi_label->efi_nparts; i ++) { + if (resv_start < efi_label->efi_parts[i].p_start) { + resv_start = efi_label->efi_parts[i].p_start; + resv_index = i; + } + } + + /* + * Find the last physically non-zero partition before that. + * This is the data partition. + */ + for (i = 0; i < resv_index; i ++) { + if (data_start < efi_label->efi_parts[i].p_start) { + data_start = efi_label->efi_parts[i].p_start; + data_index = i; + } + } + data_size = efi_label->efi_parts[data_index].p_size; + + /* + * See the "efi_alloc_and_init" function for more information + * about where this "nblocks" value comes from. + */ + nblocks = efi_label->efi_first_u_lba - 1; + + /* + * Determine if the EFI label is out of sync. We check that: + * + * 1. the data partition ends at the limit we set, and + * 2. the reserved partition starts at the limit we set. + * + * If either of these conditions is not met, then we need to + * resync the EFI label. + * + * The limit is the last usable LBA, determined by the last LBA + * and the first usable LBA fields on the EFI label of the disk + * (see the lines directly above). Additionally, we factor in + * EFI_MIN_RESV_SIZE (per its use in "zpool_label_disk") and + * P2ALIGN it to ensure the partition boundaries are aligned + * (for performance reasons). The alignment should match the + * alignment used by the "zpool_label_disk" function. + */ + limit = P2ALIGN(efi_label->efi_last_lba - nblocks - EFI_MIN_RESV_SIZE, + PARTITION_END_ALIGNMENT); + if (data_start + data_size != limit || resv_start != limit) + sync_needed = B_TRUE; + + if (efi_debug && sync_needed) + (void) fprintf(stderr, "efi_use_whole_disk: sync needed\n"); + /* * If alter_lba is 1, we are using the backup label. * Since we can locate the backup label by disk capacity, * there must be no unallocated space. */ if ((efi_label->efi_altern_lba == 1) || (efi_label->efi_altern_lba - >= efi_label->efi_last_lba)) { + >= efi_label->efi_last_lba && !sync_needed)) { if (efi_debug) { (void) fprintf(stderr, "efi_use_whole_disk: requested space not found\n"); @@ -1137,19 +1194,6 @@ efi_use_whole_disk(int fd) return (VT_ENOSPC); } - difference = efi_label->efi_last_lba - efi_label->efi_altern_lba; - - /* - * Find the last physically non-zero partition. - * This should be the reserved partition. - */ - for (i = 0; i < efi_label->efi_nparts; i ++) { - if (resv_start < efi_label->efi_parts[i].p_start) { - resv_start = efi_label->efi_parts[i].p_start; - resv_index = i; - } - } - /* * Verify that we've found the reserved partition by checking * that it looks the way it did when we created it in zpool_label_disk. @@ -1167,17 +1211,36 @@ efi_use_whole_disk(int fd) return (VT_ENOSPC); } - /* - * Find the last physically non-zero partition before that. - * This is the data partition. - */ - for (i = 0; i < resv_index; i ++) { - if (data_start < efi_label->efi_parts[i].p_start) { - data_start = efi_label->efi_parts[i].p_start; - data_index = i; + if (data_start + data_size != resv_start) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_use_whole_disk: " + "data_start (%lli) + " + "data_size (%lli) != " + "resv_start (%lli)\n", + data_start, data_size, resv_start); + } + + return (VT_EINVAL); + } + + if (limit < resv_start) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_use_whole_disk: " + "limit (%lli) < resv_start (%lli)\n", + limit, resv_start); } + + return (VT_EINVAL); } + difference = limit - resv_start; + + if (efi_debug) + (void) fprintf(stderr, + "efi_use_whole_disk: difference is %lli\n", difference); + /* * Move the reserved partition. There is currently no data in * here except fabricated devids (which get generated via @@ -1185,7 +1248,7 @@ efi_use_whole_disk(int fd) */ efi_label->efi_parts[data_index].p_size += difference; efi_label->efi_parts[resv_index].p_start += difference; - efi_label->efi_last_u_lba += difference; + efi_label->efi_last_u_lba = efi_label->efi_last_lba - nblocks; rval = efi_write(fd, efi_label); if (rval < 0) { @@ -1202,7 +1265,6 @@ efi_use_whole_disk(int fd) return (0); } - /* * write EFI label and backup label */