Skip to content

Commit

Permalink
DLPX-51539 Improve I/O performance of metaslab spacemaps with a log s…
Browse files Browse the repository at this point in the history
…pacemap

DLPX-41227 metaslab_sync_done should not wait for metaslab_load to complete

Reviewed at: http://reviews.delphix.com/r/37308/
  • Loading branch information
sdimitro committed Mar 26, 2018
1 parent c4f3811 commit fe7bf6c
Show file tree
Hide file tree
Showing 53 changed files with 4,350 additions and 1,019 deletions.
360 changes: 275 additions & 85 deletions usr/src/cmd/mdb/common/modules/zfs/zfs.c

Large diffs are not rendered by default.

647 changes: 487 additions & 160 deletions usr/src/cmd/zdb/zdb.c

Large diffs are not rendered by default.

31 changes: 14 additions & 17 deletions usr/src/cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -2563,24 +2563,12 @@ vdev_lookup_by_path(vdev_t *vd, const char *path)
return (NULL);
}

/*
* Find the first available hole which can be used as a top-level.
*/
int
find_vdev_hole(spa_t *spa)
static int
spa_num_top_vdevs(spa_t *spa)
{
vdev_t *rvd = spa->spa_root_vdev;
int c;

ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV);

for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *cvd = rvd->vdev_child[c];

if (cvd->vdev_ishole)
break;
}
return (c);
ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV);
return (rvd->vdev_children);
}

/*
Expand All @@ -2602,7 +2590,7 @@ ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id)

spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);

ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves;
ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves;

/*
* If we have slogs then remove them 1/4 of the time.
Expand Down Expand Up @@ -6527,6 +6515,15 @@ ztest_init(ztest_shared_t *zs)
props = make_random_props();
for (int i = 0; i < SPA_FEATURES; i++) {
char buf[1024];

/*
* 75% chance of using the log space map feature. We want ztest
* to exercise both the code paths that use the log space map
* feature and the ones that don't.
*/
if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
continue;

(void) snprintf(buf, sizeof (buf), "feature@%s",
spa_feature_table[i].fi_uname);
VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0));
Expand Down
11 changes: 11 additions & 0 deletions usr/src/common/zfs/zfeature_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,17 @@ zpool_feature_init(void)
"Improved clone deletion performance.",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);

static const spa_feature_t log_spacemap_deps[] = {
SPA_FEATURE_SPACEMAP_V2,
SPA_FEATURE_NONE
};
zfeature_register(SPA_FEATURE_LOG_SPACEMAP,
"com.delphix:log_spacemap", "log_spacemap",
"Log metaslab changes on a single spacemap and "
"flush them periodically.",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
log_spacemap_deps);

static const spa_feature_t large_blocks_deps[] = {
SPA_FEATURE_EXTENSIBLE_DATASET,
SPA_FEATURE_NONE
Expand Down
1 change: 1 addition & 0 deletions usr/src/common/zfs/zfeature_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ typedef enum spa_feature {
SPA_FEATURE_POOL_CHECKPOINT,
SPA_FEATURE_SPACEMAP_V2,
SPA_FEATURE_LIVELIST,
SPA_FEATURE_LOG_SPACEMAP,
SPA_FEATURES
} spa_feature_t;

Expand Down
4 changes: 2 additions & 2 deletions usr/src/man/man1m/zdb.1m
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
.\"
.\"
.\" Copyright 2012, Richard Lowe.
.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved.
.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
.\" Copyright 2017 Nexenta Systems, Inc.
.\"
.Dd April 14, 2017
Expand Down Expand Up @@ -190,7 +190,7 @@ If the
.Fl u
option is also specified, also display the uberblocks on this device.
.It Fl L
Disable leak tracing and the loading of space maps.
Disable leak detection and the loading of space maps.
By default,
.Nm
verifies that all non-free blocks are referenced, which can be very expensive.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ function custom_cleanup
{
set_vdev_validate_skip 0
cleanup
log_must mdb_ctf_set_int vdev_min_ms_count 0t16
log_must mdb_ctf_set_int zfs_vdev_min_ms_count 0t16
log_must mdb_ctf_set_int spa_allocators 0t4
}

Expand Down Expand Up @@ -206,7 +206,8 @@ increase_device_sizes $(( FILE_SIZE * 4 ))

# Increase the number of metaslabs for small pools temporarily to
# reduce the chance of reusing a metaslab that holds old MOS metadata.
log_must mdb_ctf_set_int vdev_min_ms_count 0t150
log_must mdb_ctf_set_int zfs_vdev_min_ms_count 0t150

# Decrease the number of allocators for pools created during this test,
# to increase the odds that metadata survives from old txgs.
log_must mdb_ctf_set_int spa_allocators 0t1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#

#
# Copyright (c) 2016 by Delphix. All rights reserved.
# Copyright (c) 2016, 2018 by Delphix. All rights reserved.
#

. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib
Expand Down Expand Up @@ -64,6 +64,8 @@ function custom_cleanup
log_must set_zfs_txg_timeout $ZFS_TXG_TIMEOUT
log_must rm -rf $BACKUP_DEVICE_DIR
cleanup
log_must mdb_ctf_set_int zfs_vdev_min_ms_count 0t16
log_must mdb_ctf_set_int spa_allocators 0t4
}

log_onexit custom_cleanup
Expand Down Expand Up @@ -157,6 +159,14 @@ log_must mkdir $BACKUP_DEVICE_DIR
# Make the devices bigger to reduce chances of overwriting MOS metadata.
increase_device_sizes $(( FILE_SIZE * 4 ))

# Increase the number of metaslabs for small pools temporarily to
# reduce the chance of reusing a metaslab that holds old MOS metadata.
log_must mdb_ctf_set_int zfs_vdev_min_ms_count 0t150

# Decrease the number of allocators for pools created during this test,
# to increase the odds that metadata survives from old txgs.
log_must mdb_ctf_set_int spa_allocators 0t1

# We set zfs_txg_timeout to 1 to reduce resilvering time at each sync.
ZFS_TXG_TIMEOUT=$(get_zfs_txg_timeout)
set_zfs_txg_timeout 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,21 @@
#

#
# Copyright (c) 2017 by Delphix. All rights reserved.
# Copyright (c) 2017, 2018 by Delphix. All rights reserved.
#

. $STF_SUITE/tests/functional/pool_checkpoint/pool_checkpoint.kshlib

#
# DESCRIPTION:
# The maximum address that can be described by the current space
# map design (assuming the minimum 512-byte addressable storage)
# limits the maximum allocatable space of any top-level vdev to
# 64PB whenever a vdev-wide space map is used.
# The maximum address that can be described by a single-word
# space map entry limits the maximum allocatable space of any
# top-level vdev to 64PB whenever a vdev-wide space map is used.
#
# Since a vdev-wide space map is introduced for the checkpoint
# we want to ensure that we cannot checkpoint a pool that has a
# top-level vdev with more than 64PB of allocatable space.
# we want to ensure that we cannot checkpoint a pool that does
# not use the new space map encoding (V2) and has a top-level
# vdev with more than 64PB of allocatable space.
#
# Note: Since this is a pool created from file-based vdevs we
# are guaranteed that vdev_ashift is SPA_MINBLOCKSHIFT
Expand All @@ -35,12 +35,25 @@
#
# STRATEGY:
# 1. Create pool with a disk of exactly 64PB
# (so ~63.5PB of allocatable space)
# (so ~63.5PB of allocatable space) and
# ensure that it has the checkpoint feature
# enabled but not space map V2.
# 2. Ensure that you can checkpoint it
# 3. Create pool with a disk of exactly 65PB
# (so ~64.5PB of allocatable space)
# (so ~64.5PB of allocatable space) with
# the same setup.
# 4. Ensure we fail trying to checkpoint it
#
# Note:
# This test used to create the two pools and attempt to checkpoint
# them at the same time, then destroy them. We later had to change
# this to test one pool and then destroy it at a time, as the
# metaslabs (even though empty) consumed a lot of memory, especially
# on a machine that has been running with kmem_flags on. To give
# an example, each metaslab structure is 1712 bytes (at the time of
# this writing), and each vdev has 128K metaslabs, which means that
# just the structures consume 131071 * 1712 = ~224M.
#

verify_runnable "global"

Expand All @@ -65,10 +78,14 @@ log_must zfs create $DISKFS
log_must mkfile -n $((64 * 1024 * 1024))g $DISK64PB
log_must mkfile -n $((65 * 1024 * 1024))g $DISK65PB

log_must zpool create $TESTPOOL1 $DISK64PB
log_must zpool create $TESTPOOL2 $DISK65PB

log_must zpool create -d $TESTPOOL1 $DISK64PB
log_must zpool set feature@zpool_checkpoint=enabled $TESTPOOL1
log_must zpool checkpoint $TESTPOOL1
destroy_pool $TESTPOOL1

log_must zpool create -d $TESTPOOL2 $DISK65PB
log_must zpool set feature@zpool_checkpoint=enabled $TESTPOOL2
log_mustnot zpool checkpoint $TESTPOOL2
destroy_pool $TESTPOOL2

log_pass "Attempting to checkpoint a pool with a vdev that's more than 64PB."
Original file line number Diff line number Diff line change
Expand Up @@ -314,8 +314,19 @@ function fragment_after_checkpoint_and_verify
#
log_must zpool list -v

log_must zdb $NESTEDPOOL
log_must zdb -kc $NESTEDPOOL
#
# Typically we would just run zdb at this point and things
# would be fine. Unfortunately, if there is still any
# background I/O in the pool the zdb command can fail with
# checksum errors temporarily.
#
# Export the pool when running zdb so the pool is idle and
# the verification results are consistent.
#
log_must zpool export $NESTEDPOOL
log_must zdb -e -p $FILEDISKDIR $NESTEDPOOL
log_must zdb -e -p $FILEDISKDIR -kc $NESTEDPOOL
log_must zpool import -d $FILEDISKDIR $NESTEDPOOL
}

#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#

#
# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
# Copyright (c) 2015, 2018 by Delphix. All rights reserved.
#

. $STF_SUITE/include/libtest.shlib
Expand Down Expand Up @@ -91,13 +91,13 @@ mapping_size_after=$(indirect_vdev_mapping_size $TESTPOOL)

#
# After the remap, there should not be very many blocks referenced. The reason
# why our threshold is as high as 512 is because our ratio of metadata to
# why our threshold is as high as 2048 is because our ratio of metadata to
# user data is relatively high, with only 64M of user data on the file system.
#
(( mapping_size_after < mapping_size_before )) || \
log_fail "Mapping size did not decrease after remap: " \
"$mapping_size_before before to $mapping_size_after after."
(( mapping_size_after < 512 )) || \
(( mapping_size_after < 2048 )) || \
log_fail "Mapping size not small enough after remap: " \
"$mapping_size_before before to $mapping_size_after after."

Expand Down
3 changes: 2 additions & 1 deletion usr/src/uts/common/Makefile.files
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

#
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2011, 2017 by Delphix. All rights reserved.
# Copyright (c) 2011, 2018 by Delphix. All rights reserved.
# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
# Copyright 2015 Nexenta Systems, Inc. All rights reserved.
# Copyright 2016 Garrett D'Amore <[email protected]>
Expand Down Expand Up @@ -1439,6 +1439,7 @@ ZFS_COMMON_OBJS += \
spa_config.o \
spa_errlog.o \
spa_history.o \
spa_log_spacemap.o \
spa_misc.o \
space_map.o \
space_reftree.o \
Expand Down
1 change: 0 additions & 1 deletion usr/src/uts/common/fs/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -4351,7 +4351,6 @@ arc_adapt(int bytes, arc_state_t *state)
return;
}


if (arc_no_grow)
return;

Expand Down
32 changes: 32 additions & 0 deletions usr/src/uts/common/fs/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -3448,6 +3448,33 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
zio_nowait(zio);
}

/*
 * Debug-time sanity check for a dirty bonus buffer: every byte that lies
 * past the bonus length recorded in the dnode (dn_bonuslen), up to
 * DN_MAX_BONUSLEN, must be zero.
 *
 * This exists to catch development mistakes where a phys structure
 * (e.g. space_map_phys_t) grows and, because of incorrect feature
 * management, code operating on an older pool touches more of the bonus
 * buffer than the length that was actually recorded for it.
 *
 * For example, this would catch an error in the feature logic where we
 * open an older pool and attempt to write the space map histogram of a
 * space map whose size is SPACE_MAP_SIZE_V0.
 *
 * dr - dirty record whose dt.dl.dr_data is the bonus data being synced;
 *      the dnode is looked up through dr->dr_dbuf.
 *
 * All checks are expressed with ASSERT macros; nothing is returned.
 */
static void
dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
{
	char *bonus_start = (char *)dr->dt.dl.dr_data;
	uint16_t bonuslen = DB_DNODE(dr->dr_dbuf)->dn_phys->dn_bonuslen;

	/* check the recorded length before doing any pointer arithmetic */
	ASSERT3U(DN_MAX_BONUSLEN, >=, bonuslen);

	char *bonus_limit = bonus_start + DN_MAX_BONUSLEN;
	char *datap_end = bonus_start + bonuslen;

	/* walk the unused tail of the buffer; it must be all zeroes */
	while (datap_end < bonus_limit) {
		ASSERT(*datap_end == 0);
		datap_end++;
	}
}

static void
dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
Expand Down Expand Up @@ -3498,9 +3525,14 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(*datap != NULL);
ASSERT0(db->db_level);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);

bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
DB_DNODE_EXIT(db);

#ifdef DEBUG
dbuf_sync_leaf_verify_bonus_dnode(dr);
#endif

if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
Expand Down
4 changes: 2 additions & 2 deletions usr/src/uts/common/fs/zfs/dmu_objset.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@

/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
Expand Down Expand Up @@ -1296,7 +1296,7 @@ dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
ASSERT(dn->dn_dbuf->db_data_pending);
/*
* Initialize dn_zio outside dnode_sync() because the
* meta-dnode needs to set it ouside dnode_sync().
* meta-dnode needs to set it outside dnode_sync().
*/
dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
ASSERT(dn->dn_zio);
Expand Down
10 changes: 9 additions & 1 deletion usr/src/uts/common/fs/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright 2017 RackTop Systems.
Expand Down Expand Up @@ -362,6 +362,14 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
(dn->dn_nblkptr-1) * sizeof (blkptr_t));

if (newsize < dn->dn_bonuslen) {
/* clear any data after the end of the new size */
size_t diff = dn->dn_bonuslen - newsize;
char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
bzero(data_end, diff);
}

dn->dn_bonuslen = newsize;
if (newsize == 0)
dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
Expand Down
2 changes: 1 addition & 1 deletion usr/src/uts/common/fs/zfs/dsl_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
dp->dp_mos_uncompressed_delta = 0;
}

if (!multilist_is_empty(mos->os_dirty_dnodes[txg & TXG_MASK])) {
if (dmu_objset_is_dirty(mos, txg)) {
dsl_pool_sync_mos(dp, tx);
}

Expand Down
Loading

0 comments on commit fe7bf6c

Please sign in to comment.