7090 zfs should improve allocation order and throttle allocations
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: Alex Reece <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Reviewed by: Dan Kimmel <[email protected]>
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Paul Dagnelie <[email protected]>
Reviewed by: Prakash Surya <[email protected]>
Reviewed by: Sebastien Roy <[email protected]>

When write I/Os are issued, they are issued in block order, but the ZIO
pipeline drives them asynchronously through the allocation stage, which
can result in blocks being allocated out of order. We want to preserve
as much of the logical order as possible.

In addition, allocations are currently spread evenly across all
top-level vdevs, but not all top-level vdevs are equally capable. The
pipeline should detect devices that can absorb more allocations and
direct more blocks to them. This lets the allocation distribution adapt
dynamically when devices are imbalanced, since fuller devices tend to
be slower than emptier ones.

The change adds a new pool-wide allocation queue that throttles and
orders allocations in the ZIO pipeline. The queue is ordered by issue
time and offset, and it hands an initial batch of allocation work to
each top-level vdev. The allocation logic uses a reservation system:
before the allocator performs an allocation, a slot must be reserved
for it. Once a reservation is granted, the allocation is scheduled on a
given top-level vdev. Each top-level vdev maintains a maximum number of
allocations that it can handle (mg_alloc_queue_depth). The pool-wide
reserved allocations (top-levels * mg_alloc_queue_depth) are
distributed across the top-level vdevs' metaslab groups, round-robining
across all eligible metaslab groups to spread the work. As top-level
vdevs complete their work, they receive additional work from the
pool-wide allocation queue until it is emptied.
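The following is a minimal standalone model of that reservation and
round-robin flow, not code from this commit; the group count, queue
depth, and helper names (reserve_slot, dispatch, complete) are all
illustrative.

/*
 * Model of the allocation throttle: a pool-wide slot budget equal to
 * the sum of the per-top-level queue depths, and a rotor that
 * round-robins reserved work across the groups.
 */
#include <stdio.h>
#include <stdbool.h>

#define	NGROUPS			4	/* top-level vdevs (illustrative) */
#define	GROUP_QUEUE_DEPTH	3	/* per-group mg_alloc_queue_depth analogue */

static int group_inflight[NGROUPS];	/* work reserved against each group */
static int pool_reserved;		/* mc_alloc_slots analogue */
static const int pool_max = NGROUPS * GROUP_QUEUE_DEPTH;
static int rotor;			/* round-robin cursor */

/* Reserve a pool-wide slot; fail (throttle) once the budget is in use. */
static bool
reserve_slot(void)
{
	if (pool_reserved >= pool_max)
		return (false);
	pool_reserved++;
	return (true);
}

/* Hand the reserved work to the next group that still has capacity. */
static int
dispatch(void)
{
	for (int i = 0; i < NGROUPS; i++) {
		int g = (rotor + i) % NGROUPS;
		if (group_inflight[g] < GROUP_QUEUE_DEPTH) {
			group_inflight[g]++;
			rotor = (g + 1) % NGROUPS;
			return (g);
		}
	}
	return (-1);	/* unreachable while reserve_slot() gates entry */
}

/* Completion: release both the group slot and the pool-wide reservation. */
static void
complete(int g)
{
	group_inflight[g]--;
	pool_reserved--;
}

int
main(void)
{
	int placed[32], n = 0;

	/* Issue more work than the budget allows; the excess is throttled. */
	for (int i = 0; i < 20; i++) {
		if (!reserve_slot()) {
			printf("io %d throttled (%d slots in use)\n",
			    i, pool_reserved);
			continue;
		}
		placed[n++] = dispatch();
		printf("io %d -> group %d\n", i, placed[n - 1]);
	}
	while (n > 0)
		complete(placed[--n]);
	return (0);
}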

Closes #130
grwilson authored and ahrens committed Aug 2, 2016
1 parent ec740b0 commit 4756c3d
Showing 17 changed files with 1,012 additions and 209 deletions.
329 changes: 278 additions & 51 deletions usr/src/uts/common/fs/zfs/metaslab.c

Large diffs are not rendered by default.

65 changes: 64 additions & 1 deletion usr/src/uts/common/fs/zfs/refcount.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/

#include <sys/zfs_context.h>
@@ -68,6 +68,13 @@ refcount_create(refcount_t *rc)
rc->rc_tracked = reference_tracking_enable;
}

void
refcount_create_tracked(refcount_t *rc)
{
refcount_create(rc);
rc->rc_tracked = B_TRUE;
}

void
refcount_create_untracked(refcount_t *rc)
{
@@ -251,4 +258,60 @@ refcount_transfer_ownership(refcount_t *rc, void *current_holder,
ASSERT(found);
mutex_exit(&rc->rc_mtx);
}

/*
* If tracking is enabled, return true if a reference exists that matches
* the "holder" tag. If tracking is disabled, then return true if a reference
* might be held.
*/
boolean_t
refcount_held(refcount_t *rc, void *holder)
{
reference_t *ref;

mutex_enter(&rc->rc_mtx);

if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return (rc->rc_count > 0);
}

for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder) {
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
}
mutex_exit(&rc->rc_mtx);
return (B_FALSE);
}

/*
* If tracking is enabled, return true if a reference does not exist that
* matches the "holder" tag. If tracking is disabled, always return true
* since the reference might not be held.
*/
boolean_t
refcount_not_held(refcount_t *rc, void *holder)
{
reference_t *ref;

mutex_enter(&rc->rc_mtx);

if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}

for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder) {
mutex_exit(&rc->rc_mtx);
return (B_FALSE);
}
}
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
#endif /* ZFS_DEBUG */
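A short usage sketch for the two new predicates (this is not part of
the commit's diff, and example_unreserve is a hypothetical helper):
because both answer only "might be held" / "might not be held" when
tracking is disabled, they are intended for assertions rather than
program logic.

void
example_unreserve(metaslab_group_t *mg, void *tag)
{
	/* The caller is expected to hold a queue-depth reference. */
	ASSERT(refcount_held(&mg->mg_alloc_queue_depth, tag));
	(void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
	ASSERT(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
}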
47 changes: 45 additions & 2 deletions usr/src/uts/common/fs/zfs/spa.c
@@ -1275,7 +1275,6 @@ spa_unload(spa_t *spa)

ddt_unload(spa);


/*
* Drop and purge level 2 cache
*/
@@ -3634,6 +3633,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_uberblock.ub_txg = txg - 1;
spa->spa_uberblock.ub_version = version;
spa->spa_ubsync = spa->spa_uberblock;
spa->spa_load_state = SPA_LOAD_CREATE;

/*
* Create "The Godfather" zio to hold all async IOs
@@ -3818,6 +3818,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
*/
spa_evicting_os_wait(spa);
spa->spa_minref = refcount_count(&spa->spa_refcount);
spa->spa_load_state = SPA_LOAD_NONE;

mutex_exit(&spa_namespace_lock);

@@ -5321,7 +5322,7 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)

static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
nvlist_t *dev_to_remove)
nvlist_t *dev_to_remove)
{
nvlist_t **newdev = NULL;

@@ -6483,6 +6484,8 @@ spa_sync(spa_t *spa, uint64_t txg)
vdev_t *vd;
dmu_tx_t *tx;
int error;
uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
zfs_vdev_queue_depth_pct / 100;

VERIFY(spa_writeable(spa));

@@ -6494,6 +6497,10 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;

mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);

/*
* If there are any pending vdev state changes, convert them
* into config changes that go out with this transaction group.
@@ -6545,6 +6552,38 @@ spa_sync(spa_t *spa, uint64_t txg)
}
}

/*
* Set the top-level vdev's max queue depth. Evaluate each
* top-level's async write queue depth in case it changed.
* The max queue depth will not change in the middle of syncing
* out this txg.
*/
uint64_t queue_depth_total = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;

if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
!metaslab_group_initialized(mg))
continue;

/*
* It is safe to do a lock-free check here because only async
* allocations look at mg_max_alloc_queue_depth, and async
* allocations all happen from spa_sync().
*/
ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
mg->mg_max_alloc_queue_depth = max_queue_depth;
queue_depth_total += mg->mg_max_alloc_queue_depth;
}
metaslab_class_t *mc = spa_normal_class(spa);
ASSERT0(refcount_count(&mc->mc_alloc_slots));
mc->mc_alloc_max_slots = queue_depth_total;
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;

ASSERT3U(mc->mc_alloc_max_slots, <=,
max_queue_depth * rvd->vdev_children);

/*
* Iterate to convergence.
*/
@@ -6696,6 +6735,10 @@ spa_sync(spa_t *spa, uint64_t txg)

dsl_pool_sync_done(dp, txg);

mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);

/*
* Update usable space statistics.
*/
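To make the spa_sync() queue-depth arithmetic above concrete, here is a
worked example using illustrative tunable values (not necessarily the
shipped defaults):

/*
 * max_queue_depth = zfs_vdev_async_write_max_active *
 *                   zfs_vdev_queue_depth_pct / 100
 *
 * e.g. with zfs_vdev_async_write_max_active = 10 and
 * zfs_vdev_queue_depth_pct = 1000 (illustrative values):
 *
 *   max_queue_depth    = 10 * 1000 / 100 = 100 slots per top-level vdev
 *   mc_alloc_max_slots = 100 * (number of eligible top-level vdevs)
 *
 * so a pool with 8 eligible top-level vdevs would allow up to 800
 * concurrently reserved allocations before async writes are throttled.
 */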
6 changes: 6 additions & 0 deletions usr/src/uts/common/fs/zfs/spa_misc.c
@@ -570,6 +570,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);

cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@@ -619,6 +620,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa_active_count++;
}

avl_create(&spa->spa_alloc_tree, zio_timestamp_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));

/*
* Every pool starts with the default cachefile
*/
@@ -704,6 +708,7 @@ spa_remove(spa_t *spa)
kmem_free(dp, sizeof (spa_config_dirent_t));
}

avl_destroy(&spa->spa_alloc_tree);
list_destroy(&spa->spa_config_list);

nvlist_free(spa->spa_label_features);
@@ -734,6 +739,7 @@ spa_remove(spa_t *spa)
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);

mutex_destroy(&spa->spa_alloc_lock);
mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_errlist_lock);
mutex_destroy(&spa->spa_errlog_lock);
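The comparator passed to avl_create() above, zio_timestamp_compare(),
is defined in zio.c and is not among the hunks shown here. A sketch of
the ordering the commit message describes (issue time, then offset)
might look like the following; io_offset is an existing zio_t field,
while io_queued_timestamp is assumed for this sketch and the body is
purely illustrative.

static int
alloc_tree_compare_sketch(const void *a, const void *b)
{
	const zio_t *z1 = a;
	const zio_t *z2 = b;

	if (z1->io_queued_timestamp < z2->io_queued_timestamp)
		return (-1);
	if (z1->io_queued_timestamp > z2->io_queued_timestamp)
		return (1);

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	/* Fall back to pointer order so the tree never sees equal keys. */
	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);
	return (0);
}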
21 changes: 14 additions & 7 deletions usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_H
@@ -55,14 +55,15 @@ void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);

#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_GANG_AVOID 0x8
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10

int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int);
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
@@ -73,6 +74,9 @@ int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);

void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
@@ -85,10 +89,13 @@ metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
boolean_t metaslab_group_initialized(metaslab_group_t *);
uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);

#ifdef __cplusplus
}
63 changes: 62 additions & 1 deletion usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_IMPL_H
@@ -59,11 +59,42 @@ extern "C" {
* to use a block allocator that best suits that class.
*/
struct metaslab_class {
kmutex_t mc_lock;
spa_t *mc_spa;
metaslab_group_t *mc_rotor;
metaslab_ops_t *mc_ops;
uint64_t mc_aliquot;

/*
* Track the number of metaslab groups that have been initialized
* and can accept allocations. An initialized metaslab group is
* one that has been completely added to the config (i.e. we have
* updated the MOS config and the space has been added to the pool).
*/
uint64_t mc_groups;

/*
* Toggle to enable/disable the allocation throttle.
*/
boolean_t mc_alloc_throttle_enabled;

/*
* The allocation throttle works on a reservation system. Whenever
* an asynchronous zio wants to perform an allocation it must
* first reserve the number of blocks that it wants to allocate.
* If there aren't sufficient slots available for the pending zio
* then that I/O is throttled until more slots free up. The current
* number of reserved allocations is maintained by the mc_alloc_slots
* refcount. The mc_alloc_max_slots value determines the maximum
* number of allocations that the system allows. Gang blocks are
* allowed to reserve slots even if we've reached the maximum
* number of allocations allowed.
*/
uint64_t mc_alloc_max_slots;
refcount_t mc_alloc_slots;

uint64_t mc_alloc_groups; /* # of allocatable groups */

uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
@@ -85,6 +116,15 @@ struct metaslab_group {
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */

/*
* A metaslab group is considered to be initialized only after
* we have updated the MOS config and added the space to the pool.
* We only allow allocation attempts to a metaslab group if it
* has been initialized.
*/
boolean_t mg_initialized;

uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
@@ -93,6 +133,27 @@ struct metaslab_group {
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;

/*
* Each metaslab group can handle mg_max_alloc_queue_depth allocations
* which are tracked by mg_alloc_queue_depth. It's possible for a
* metaslab group to handle more allocations than its max. This
* can occur when gang blocks are required or when other groups
* are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
refcount_t mg_alloc_queue_depth;

/*
* A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
* of space then its share of work must be distributed to other
* groups.
*/
boolean_t mg_no_free_space;

uint64_t mg_allocations;
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};
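Finally, a minimal standalone model of the gang-block exception
described in the mc_alloc_slots comment above (not ZFS code; the slot
budget and function names are illustrative): a normal reservation fails
once the budget is exhausted, while a forced reservation for a gang
block is always granted so gang allocations cannot wedge behind the
throttle.

#include <stdio.h>
#include <stdbool.h>

static unsigned long reserved;			/* mc_alloc_slots analogue */
static const unsigned long max_slots = 4;	/* mc_alloc_max_slots analogue */

static bool
throttle_reserve(unsigned long slots, bool force)
{
	if (force || reserved + slots <= max_slots) {
		reserved += slots;
		return (true);
	}
	return (false);		/* caller must wait for slots to free up */
}

static void
throttle_unreserve(unsigned long slots)
{
	reserved -= slots;
}

int
main(void)
{
	printf("normal: %d\n", throttle_reserve(4, false));	/* granted */
	printf("normal: %d\n", throttle_reserve(1, false));	/* throttled */
	printf("gang:   %d\n", throttle_reserve(1, true));	/* granted */
	throttle_unreserve(5);
	return (0);
}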