Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vol_volsize_to_reservation does not account for raidz skip blocks #8973

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions cmd/zfs/zfs_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
* Copyright 2016 Nexenta Systems, Inc.
* Copyright (c) 2019 Datto Inc.
* Copyright (c) 2019, loli10K <[email protected]>
* Copyright 2019 Joyent, Inc.
*/

#include <assert.h>
Expand Down Expand Up @@ -998,10 +999,11 @@ zfs_do_create(int argc, char **argv)
zpool_close(zpool_handle);
goto error;
}
zpool_close(zpool_handle);

volsize = zvol_volsize_to_reservation(volsize, real_props);
volsize = zvol_volsize_to_reservation(zpool_handle, volsize,
real_props);
nvlist_free(real_props);
zpool_close(zpool_handle);

if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop),
&strval) != 0) {
Expand Down
5 changes: 3 additions & 2 deletions include/libzfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright Joyent, Inc.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
* Copyright (c) 2016, Intel Corporation.
* Copyright 2016 Nexenta Systems, Inc.
Expand Down Expand Up @@ -688,7 +688,8 @@ extern int zfs_hold(zfs_handle_t *, const char *, const char *,
extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *);
extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t);
extern int zfs_get_holds(zfs_handle_t *, nvlist_t **);
extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *);
extern uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t,
nvlist_t *);

typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain,
uid_t rid, uint64_t space);
Expand Down
189 changes: 181 additions & 8 deletions lib/libzfs/libzfs_dataset.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, Joyent, Inc. All rights reserved.
* Copyright 2019 Joyent, Inc.
* Copyright (c) 2011, 2018 by Delphix. All rights reserved.
* Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved.
* Copyright (c) 2012 Pawel Jakub Dawidek <[email protected]>.
Expand Down Expand Up @@ -1617,6 +1617,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
uint64_t new_reservation;
zfs_prop_t resv_prop;
nvlist_t *props;
zpool_handle_t *zph = zpool_handle(zhp);

/*
* If this is an existing volume, and someone is setting the volsize,
Expand All @@ -1631,7 +1632,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE));

if ((zvol_volsize_to_reservation(old_volsize, props) !=
if ((zvol_volsize_to_reservation(zph, old_volsize, props) !=
old_reservation) || nvlist_exists(nvl,
zfs_prop_to_name(resv_prop))) {
fnvlist_free(props);
Expand All @@ -1642,7 +1643,7 @@ zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl)
fnvlist_free(props);
return (-1);
}
new_reservation = zvol_volsize_to_reservation(new_volsize, props);
new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props);
fnvlist_free(props);

if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop),
Expand Down Expand Up @@ -1697,7 +1698,8 @@ zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl)
volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
}

resvsize = zvol_volsize_to_reservation(volsize, props);
resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize,
props);
fnvlist_free(props);

(void) nvlist_remove_all(nvl, zfs_prop_to_name(prop));
Expand Down Expand Up @@ -5376,12 +5378,176 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
}

/*
* Convert the zvol's volume size to an appropriate reservation.
* The theory of raidz space accounting
*
* The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block
* will "reference" 128KB, even though it allocates more than that, to store the
* parity information (and perhaps skip sectors). This concept of the
* "referenced" (and other DMU space accounting) being lower than the allocated
* space by a constant factor is called "raidz deflation."
*
* As mentioned above, the constant factor for raidz deflation assumes a 128KB
* block size. However, zvols typically have a much smaller block size (default
* 8KB). These smaller blocks may require proportionally much more parity
* information (and perhaps skip sectors). In this case, the change to the
* "referenced" property may be much more than the logical block size.
*
* Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written
* as follows.
*
* +-------+-------+-------+-------+-------+
* | disk1 | disk2 | disk3 | disk4 | disk5 |
* +-------+-------+-------+-------+-------+
* | P0 | D0 | D8 | D16 | D24 |
* | P1 | D1 | D9 | D17 | D25 |
* | P2 | D2 | D10 | D18 | D26 |
* | P3 | D3 | D11 | D19 | D27 |
* | P4 | D4 | D12 | D20 | D28 |
* | P5 | D5 | D13 | D21 | D29 |
* | P6 | D6 | D14 | D22 | D30 |
* | P7 | D7 | D15 | D23 | D31 |
* +-------+-------+-------+-------+-------+
*
* Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data
* sectors. The dataset's referenced will increase by 128k and the pool's
* allocated and free properties will be adjusted by 160k.
*
* A 4k block written to the same raidz vdev will require two 4k sectors. The
* blank cells represent unallocated space.
*
* +-------+-------+-------+-------+-------+
* | disk1 | disk2 | disk3 | disk4 | disk5 |
* +-------+-------+-------+-------+-------+
* | P0 | D0 | | | |
* +-------+-------+-------+-------+-------+
*
* Above, notice that the 4k block required one sector for parity and another
* for data. vdev_raidz_asize() will return 8k and as such the pool's allocated
* and free properties will be adjusted by 8k. The dataset will not be charged
* 8k. Rather, it will be charged a value that is scaled according to the
* overhead of the 128k block on the same vdev. This 8k allocation will be
* charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
* calculated in the 128k block example above.
*
* Every raidz allocation is sized to be a multiple of nparity+1 sectors. That
* is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
* allocations are a multiple of 3 sectors, and raidz3 allocations are a
* multiple of of 4 sectors. When a block does not fill the required number of
* sectors, skip blocks (sectors) are used.
*
* An 8k block being written to a raidz vdev may be written as follows:
*
* +-------+-------+-------+-------+-------+
* | disk1 | disk2 | disk3 | disk4 | disk5 |
* +-------+-------+-------+-------+-------+
* | P0 | D0 | D1 | S0 | |
* +-------+-------+-------+-------+-------+
*
* In order to maintain the nparity+1 allocation size, a skip block (S0) was
* added. For this 8k block, the pool's allocated and free properties are
* adjusted by 16k and the dataset's referenced is increased by 16k * 128k /
* 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in
* the 128k block example above.
*
* Compression may lead to a variety of block sizes being written for the same
* volume or file. There is no clear way to reserve just the amount of space
* that will be required, so the worst case (no compression) is assumed.
* Note that metadata blocks will typically be compressed, so the reservation
* size returned by zvol_volsize_to_reservation() will generally be slightly
* larger than the maximum that the volume can reference.
*/

/*
* Derived from function of same name in module/zfs/vdev_raidz.c. Returns the
* amount of space (in bytes) that will be allocated for the specified block
* size. Note that the "referenced" space accounted will be less than this, but
* not necessarily equal to "blksize", due to RAIDZ deflation.
*/
static uint64_t
vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
uint64_t blksize)
{
uint64_t asize, ndata;

ASSERT3U(ndisks, >, nparity);
ndata = ndisks - nparity;
asize = ((blksize - 1) >> ashift) + 1;
asize += nparity * ((asize + ndata - 1) / ndata);
asize = roundup(asize, nparity + 1) << ashift;

return (asize);
}

/*
* Determine how much space will be allocated if it lands on the most space-
* inefficient top-level vdev. Returns the size in bytes required to store one
* copy of the volume data. See theory comment above.
*/
static uint64_t
volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
{
nvlist_t *config, *tree, **vdevs;
uint_t nvdevs, v;
uint64_t ret = 0;

config = zpool_get_config(zhp, NULL);
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 ||
nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
&vdevs, &nvdevs) != 0) {
return (nblocks * blksize);
}

for (v = 0; v < nvdevs; v++) {
char *type;
uint64_t nparity, ashift, asize, tsize;
nvlist_t **disks;
uint_t ndisks;
uint64_t volsize;

if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE,
&type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 ||
nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY,
&nparity) != 0 ||
nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT,
&ashift) != 0 ||
nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN,
&disks, &ndisks) != 0) {
continue;
}

/* allocation size for the "typical" 128k block */
tsize = vdev_raidz_asize(ndisks, nparity, ashift,
SPA_OLD_MAXBLOCKSIZE);
/* allocation size for the blksize block */
asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize);

/*
* Scale this size down as a ratio of 128k / tsize. See theory
* statement above.
*/
volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize;
if (volsize > ret) {
ret = volsize;
}
}

if (ret == 0) {
ret = nblocks * blksize;
}

return (ret);
}

/*
* Convert the zvol's volume size to an appropriate reservation. See theory
* comment above.
*
* Note: If this routine is updated, it is necessary to update the ZFS test
* suite's shell version in reservation.kshlib.
* suite's shell version in reservation.shlib.
*/
uint64_t
zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize,
nvlist_t *props)
{
uint64_t numdb;
uint64_t nblocks, volblocksize;
Expand All @@ -5397,7 +5563,14 @@ zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
&volblocksize) != 0)
volblocksize = ZVOL_DEFAULT_BLOCKSIZE;
nblocks = volsize/volblocksize;

nblocks = volsize / volblocksize;
/*
* Metadata defaults to using 128k blocks, not volblocksize blocks. For
* this reason, only the data blocks are scaled based on vdev config.
*/
volsize = volsize_from_vdevs(zph, nblocks, volblocksize);

/* start with metadnode L0-L6 */
numdb = 7;
/* calculate number of indirects */
Expand Down
3 changes: 2 additions & 1 deletion tests/runfiles/linux.run
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,8 @@ tags = ['functional', 'refquota']

[tests/functional/refreserv]
tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
'refreserv_004_pos', 'refreserv_005_pos']
'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz',
'refreserv_raidz']
tags = ['functional', 'refreserv']

[tests/functional/removal]
Expand Down
4 changes: 3 additions & 1 deletion tests/zfs-tests/tests/functional/refreserv/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ dist_pkgdata_SCRIPTS = \
refreserv_002_pos.ksh \
refreserv_003_pos.ksh \
refreserv_004_pos.ksh \
refreserv_005_pos.ksh
refreserv_005_pos.ksh \
refreserv_multi_raidz.ksh \
refreserv_raidz.ksh

dist_pkgdata_DATA = \
refreserv.cfg
Loading