forked from openzfs/zfs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
OpenZFS 9318 - vol_volsize_to_reservation does not account for raidz …
…skip blocks When a volume is created in a pool with raidz vdevs and volblocksize != 128k, the volume can reference more space than is reserved with the automatically calculated refreservation. There are two deficiencies in vol_volsize_to_reservation that contribute to this: 1) Skip blocks may be added to keep each allocation a multiple of parity + 1. This is the dominating factor when volblocksize is close to 2^ashift. 2) raidz deflation for 128 KB blocks is different for most other block sizes. See "The theory of raidz space accounting" comment in libzfs_dataset.c for a full explanation. Authored by: Mike Gerdts <[email protected]> Reviewed by: Richard Elling <[email protected]> Reviewed by: Sanjay Nadkarni <[email protected]> Reviewed by: Jerry Jelinek <[email protected]> Reviewed by: Matt Ahrens <[email protected]> Reviewed by: Kody Kantor <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Approved by: Dan McDonald <[email protected]> Ported-by: Mike Gerdts <[email protected]> Porting Notes: * ZTS: wait for zvols to exist before writing * ZTS: use log_must_busy with {zpool|zfs} destroy OpenZFS-issue: https://www.illumos.org/issues/9318 OpenZFS-commit: illumos/illumos-gate@b73ccab0 Closes openzfs#8973
- Loading branch information
Mike Gerdts
authored and
TulsiJain
committed
Jul 20, 2019
1 parent
e3fa53e
commit 019b7f8
Showing
7 changed files
with
520 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -29,6 +29,7 @@ | |
* Copyright 2016 Nexenta Systems, Inc. | ||
* Copyright (c) 2019 Datto Inc. | ||
* Copyright (c) 2019, loli10K <[email protected]> | ||
* Copyright 2019 Joyent, Inc. | ||
*/ | ||
|
||
#include <assert.h> | ||
|
@@ -998,10 +999,11 @@ zfs_do_create(int argc, char **argv) | |
zpool_close(zpool_handle); | ||
goto error; | ||
} | ||
zpool_close(zpool_handle); | ||
|
||
volsize = zvol_volsize_to_reservation(volsize, real_props); | ||
volsize = zvol_volsize_to_reservation(zpool_handle, volsize, | ||
real_props); | ||
nvlist_free(real_props); | ||
zpool_close(zpool_handle); | ||
|
||
if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), | ||
&strval) != 0) { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,7 +21,7 @@ | |
|
||
/* | ||
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | ||
* Copyright (c) 2018, Joyent, Inc. All rights reserved. | ||
* Copyright 2019 Joyent, Inc. | ||
* Copyright (c) 2011, 2018 by Delphix. All rights reserved. | ||
* Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. | ||
* Copyright (c) 2012 Pawel Jakub Dawidek <[email protected]>. | ||
|
@@ -1617,6 +1617,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) | |
uint64_t new_reservation; | ||
zfs_prop_t resv_prop; | ||
nvlist_t *props; | ||
zpool_handle_t *zph = zpool_handle(zhp); | ||
|
||
/* | ||
* If this is an existing volume, and someone is setting the volsize, | ||
|
@@ -1631,7 +1632,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) | |
fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), | ||
zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); | ||
|
||
if ((zvol_volsize_to_reservation(old_volsize, props) != | ||
if ((zvol_volsize_to_reservation(zph, old_volsize, props) != | ||
old_reservation) || nvlist_exists(nvl, | ||
zfs_prop_to_name(resv_prop))) { | ||
fnvlist_free(props); | ||
|
@@ -1642,7 +1643,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) | |
fnvlist_free(props); | ||
return (-1); | ||
} | ||
new_reservation = zvol_volsize_to_reservation(new_volsize, props); | ||
new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props); | ||
fnvlist_free(props); | ||
|
||
if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), | ||
|
@@ -1697,7 +1698,8 @@ zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) | |
volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); | ||
} | ||
|
||
resvsize = zvol_volsize_to_reservation(volsize, props); | ||
resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize, | ||
props); | ||
fnvlist_free(props); | ||
|
||
(void) nvlist_remove_all(nvl, zfs_prop_to_name(prop)); | ||
|
@@ -5376,12 +5378,176 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) | |
} | ||
|
||
/* | ||
* Convert the zvol's volume size to an appropriate reservation. | ||
* The theory of raidz space accounting | ||
* | ||
* The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block | ||
* will "reference" 128KB, even though it allocates more than that, to store the | ||
* parity information (and perhaps skip sectors). This concept of the | ||
* "referenced" (and other DMU space accounting) being lower than the allocated | ||
* space by a constant factor is called "raidz deflation." | ||
* | ||
* As mentioned above, the constant factor for raidz deflation assumes a 128KB | ||
* block size. However, zvols typically have a much smaller block size (default | ||
* 8KB). These smaller blocks may require proportionally much more parity | ||
* information (and perhaps skip sectors). In this case, the change to the | ||
* "referenced" property may be much more than the logical block size. | ||
* | ||
* Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written | ||
* as follows. | ||
* | ||
* +-------+-------+-------+-------+-------+ | ||
* | disk1 | disk2 | disk3 | disk4 | disk5 | | ||
* +-------+-------+-------+-------+-------+ | ||
* | P0 | D0 | D8 | D16 | D24 | | ||
* | P1 | D1 | D9 | D17 | D25 | | ||
* | P2 | D2 | D10 | D18 | D26 | | ||
* | P3 | D3 | D11 | D19 | D27 | | ||
* | P4 | D4 | D12 | D20 | D28 | | ||
* | P5 | D5 | D13 | D21 | D29 | | ||
* | P6 | D6 | D14 | D22 | D30 | | ||
* | P7 | D7 | D15 | D23 | D31 | | ||
* +-------+-------+-------+-------+-------+ | ||
* | ||
* Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data | ||
* sectors. The dataset's referenced will increase by 128k and the pool's | ||
* allocated and free properties will be adjusted by 160k. | ||
* | ||
* A 4k block written to the same raidz vdev will require two 4k sectors. The | ||
* blank cells represent unallocated space. | ||
* | ||
* +-------+-------+-------+-------+-------+ | ||
* | disk1 | disk2 | disk3 | disk4 | disk5 | | ||
* +-------+-------+-------+-------+-------+ | ||
* | P0 | D0 | | | | | ||
* +-------+-------+-------+-------+-------+ | ||
* | ||
* Above, notice that the 4k block required one sector for parity and another | ||
* for data. vdev_raidz_asize() will return 8k and as such the pool's allocated | ||
* and free properties will be adjusted by 8k. The dataset will not be charged | ||
* 8k. Rather, it will be charged a value that is scaled according to the | ||
* overhead of the 128k block on the same vdev. This 8k allocation will be | ||
* charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as | ||
* calculated in the 128k block example above. | ||
* | ||
* Every raidz allocation is sized to be a multiple of nparity+1 sectors. That | ||
* is, every raidz1 allocation will be a multiple of 2 sectors, raidz2 | ||
* allocations are a multiple of 3 sectors, and raidz3 allocations are a | ||
* multiple of 4 sectors. When a block does not fill the required number of | ||
* sectors, skip blocks (sectors) are used. | ||
* | ||
* An 8k block being written to a raidz vdev may be written as follows: | ||
* | ||
* +-------+-------+-------+-------+-------+ | ||
* | disk1 | disk2 | disk3 | disk4 | disk5 | | ||
* +-------+-------+-------+-------+-------+ | ||
* | P0 | D0 | D1 | S0 | | | ||
* +-------+-------+-------+-------+-------+ | ||
* | ||
* In order to maintain the nparity+1 allocation size, a skip block (S0) was | ||
* added. For this 8k block, the pool's allocated and free properties are | ||
* adjusted by 16k and the dataset's referenced is increased by 16k * 128k / | ||
* 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in | ||
* the 128k block example above. | ||
* | ||
* Compression may lead to a variety of block sizes being written for the same | ||
* volume or file. There is no clear way to reserve just the amount of space | ||
* that will be required, so the worst case (no compression) is assumed. | ||
* Note that metadata blocks will typically be compressed, so the reservation | ||
* size returned by zvol_volsize_to_reservation() will generally be slightly | ||
* larger than the maximum that the volume can reference. | ||
*/ | ||
|
||
/* | ||
* Derived from function of same name in module/zfs/vdev_raidz.c. Returns the | ||
* amount of space (in bytes) that will be allocated for the specified block | ||
* size. Note that the "referenced" space accounted will be less than this, but | ||
* not necessarily equal to "blksize", due to RAIDZ deflation. | ||
*/ | ||
static uint64_t | ||
vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, | ||
uint64_t blksize) | ||
{ | ||
uint64_t asize, ndata; | ||
|
||
ASSERT3U(ndisks, >, nparity); | ||
ndata = ndisks - nparity; | ||
asize = ((blksize - 1) >> ashift) + 1; | ||
asize += nparity * ((asize + ndata - 1) / ndata); | ||
asize = roundup(asize, nparity + 1) << ashift; | ||
|
||
return (asize); | ||
} | ||
|
||
/* | ||
* Determine how much space will be allocated if it lands on the most space- | ||
* inefficient top-level vdev. Returns the size in bytes required to store one | ||
* copy of the volume data. See theory comment above. | ||
*/ | ||
static uint64_t | ||
volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) | ||
{ | ||
nvlist_t *config, *tree, **vdevs; | ||
uint_t nvdevs, v; | ||
uint64_t ret = 0; | ||
|
||
config = zpool_get_config(zhp, NULL); | ||
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || | ||
nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, | ||
&vdevs, &nvdevs) != 0) { | ||
return (nblocks * blksize); | ||
} | ||
|
||
for (v = 0; v < nvdevs; v++) { | ||
char *type; | ||
uint64_t nparity, ashift, asize, tsize; | ||
nvlist_t **disks; | ||
uint_t ndisks; | ||
uint64_t volsize; | ||
|
||
if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, | ||
&type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 || | ||
nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY, | ||
&nparity) != 0 || | ||
nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT, | ||
&ashift) != 0 || | ||
nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN, | ||
&disks, &ndisks) != 0) { | ||
continue; | ||
} | ||
|
||
/* allocation size for the "typical" 128k block */ | ||
tsize = vdev_raidz_asize(ndisks, nparity, ashift, | ||
SPA_OLD_MAXBLOCKSIZE); | ||
/* allocation size for the blksize block */ | ||
asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize); | ||
|
||
/* | ||
* Scale this size down as a ratio of 128k / tsize. See theory | ||
* statement above. | ||
*/ | ||
volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; | ||
if (volsize > ret) { | ||
ret = volsize; | ||
} | ||
} | ||
|
||
if (ret == 0) { | ||
ret = nblocks * blksize; | ||
} | ||
|
||
return (ret); | ||
} | ||
|
||
/* | ||
* Convert the zvol's volume size to an appropriate reservation. See theory | ||
* comment above. | ||
* | ||
* Note: If this routine is updated, it is necessary to update the ZFS test | ||
* suite's shell version in reservation.kshlib. | ||
* suite's shell version in reservation.shlib. | ||
*/ | ||
uint64_t | ||
zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) | ||
zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize, | ||
nvlist_t *props) | ||
{ | ||
uint64_t numdb; | ||
uint64_t nblocks, volblocksize; | ||
|
@@ -5397,7 +5563,14 @@ zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) | |
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), | ||
&volblocksize) != 0) | ||
volblocksize = ZVOL_DEFAULT_BLOCKSIZE; | ||
nblocks = volsize/volblocksize; | ||
|
||
nblocks = volsize / volblocksize; | ||
/* | ||
* Metadata defaults to using 128k blocks, not volblocksize blocks. For | ||
* this reason, only the data blocks are scaled based on vdev config. | ||
*/ | ||
volsize = volsize_from_vdevs(zph, nblocks, volblocksize); | ||
|
||
/* start with metadnode L0-L6 */ | ||
numdb = 7; | ||
/* calculate number of indirects */ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.