Skip to content

Commit

Permalink
1051 zfs should handle imbalanced luns
Browse files Browse the repository at this point in the history
Reviewed by: Eric Schrock <[email protected]>
Reviewed by: Matt Ahrens <[email protected]>
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: Albert Lee <[email protected]>
Reviewed by: Gordon Ross <[email protected]>
Approved by: Garrett D'Amore <[email protected]>
  • Loading branch information
grwilson committed May 29, 2011
1 parent 23a9c29 commit 09c9d37
Show file tree
Hide file tree
Showing 8 changed files with 123 additions and 28 deletions.
2 changes: 2 additions & 0 deletions usr/src/cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
*/

/*
Expand Down Expand Up @@ -5131,6 +5132,7 @@ ztest_run(ztest_shared_t *zs)
*/
kernel_init(FREAD | FWRITE);
VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
spa->spa_debug = B_TRUE;
zs->zs_spa = spa;

spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;
Expand Down
105 changes: 78 additions & 27 deletions usr/src/uts/common/fs/zfs/metaslab.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
*/

#include <sys/zfs_context.h>
Expand All @@ -30,9 +31,28 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>

/*
 * "Fast" ganging lets an allocation fall back to gang blocks quickly so
 * that we do not have to load lots of space_maps in a single txg. Some
 * allocations must not take that shortcut and instead need an exhaustive
 * search of every metaslab on the device: at present that is anything
 * related to gang blocks (gang children and gang headers) and anything
 * that has asked to avoid ganging, such as dump device allocations.
 *
 * Evaluates to non-zero only when none of those flags are set.
 */
#define	CAN_FASTGANG(flags) \
	(((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)) == 0)

uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */

/*
* This value defines the number of allowed allocation failures per vdev.
* If a device reaches this threshold in a given txg then we consider skipping
* allocations on that device.
*/
int zfs_mg_alloc_failures;

/*
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
*/
Expand Down Expand Up @@ -671,7 +691,7 @@ static space_map_ops_t metaslab_ndf_ops = {
metaslab_ndf_fragmented
};

space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;

/*
* ==========================================================================
Expand Down Expand Up @@ -844,7 +864,7 @@ metaslab_prefetch(metaslab_group_t *mg)
}

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
metaslab_group_t *mg = msp->ms_group;
space_map_t *sm = &msp->ms_map;
Expand Down Expand Up @@ -877,13 +897,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
mutex_exit(&mg->mg_lock);
}

/*
* If we were able to load the map then make sure
* that this map is still able to satisfy our request.
*/
if (msp->ms_weight < size)
return (ENOSPC);

metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight);
}
Expand Down Expand Up @@ -1099,6 +1112,7 @@ void
metaslab_sync_reassess(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
int64_t failures = mg->mg_alloc_failures;

/*
* Re-evaluate all metaslabs which have lower offsets than the
Expand All @@ -1115,6 +1129,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
mutex_exit(&msp->ms_lock);
}

atomic_add_64(&mg->mg_alloc_failures, -failures);

/*
* Prefetch the next potential metaslabs
*/
Expand All @@ -1139,9 +1155,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
}

static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
uint64_t min_distance, dva_t *dva, int d)
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
avl_tree_t *t = &mg->mg_metaslab_tree;
Expand All @@ -1162,11 +1179,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,

mutex_enter(&mg->mg_lock);
for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
if (msp->ms_weight < size) {
if (msp->ms_weight < asize) {
spa_dbgmsg(spa, "%s: failed to meet weight "
"requirement: vdev %llu, txg %llu, mg %p, "
"msp %p, psize %llu, asize %llu, "
"failures %llu, weight %llu",
spa_name(spa), mg->mg_vd->vdev_id, txg,
mg, msp, psize, asize,
mg->mg_alloc_failures, msp->ms_weight);
mutex_exit(&mg->mg_lock);
return (-1ULL);
}

was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
if (activation_weight == METASLAB_WEIGHT_PRIMARY)
break;
Expand All @@ -1185,6 +1208,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
if (msp == NULL)
return (-1ULL);

/*
* If we've already reached the allowable number of failed
* allocation attempts on this metaslab group then we
* consider skipping it. We skip it only if we're allowed
* to "fast" gang, the physical size is larger than
* a gang block, and we're attempting to allocate from
* the primary metaslab.
*/
if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
activation_weight == METASLAB_WEIGHT_PRIMARY) {
spa_dbgmsg(spa, "%s: skipping metaslab group: "
"vdev %llu, txg %llu, mg %p, psize %llu, "
"asize %llu, failures %llu", spa_name(spa),
mg->mg_vd->vdev_id, txg, mg, psize, asize,
mg->mg_alloc_failures);
return (-1ULL);
}

mutex_enter(&msp->ms_lock);

/*
Expand All @@ -1193,7 +1235,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
if (msp->ms_weight < size || (was_active &&
if (msp->ms_weight < asize || (was_active &&
!(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
activation_weight == METASLAB_WEIGHT_PRIMARY)) {
mutex_exit(&msp->ms_lock);
Expand All @@ -1208,14 +1250,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
continue;
}

if (metaslab_activate(msp, activation_weight, size) != 0) {
if (metaslab_activate(msp, activation_weight) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}

if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
break;

atomic_inc_64(&mg->mg_alloc_failures);

metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));

mutex_exit(&msp->ms_lock);
Expand All @@ -1224,7 +1268,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);

mutex_exit(&msp->ms_lock);

Expand Down Expand Up @@ -1351,7 +1395,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
dva, d, flags);
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
Expand All @@ -1363,18 +1408,24 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
vdev_stat_t *vs = &vd->vdev_stat;
int64_t vu, cu;

/*
* Determine percent used in units of 0..1024.
* (This is just to avoid floating point.)
*/
vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);

/*
* Bias by at most +/- 25% of the aliquot.
* Calculate how much more or less we should
* try to allocate from this device during
* this iteration around the rotor.
* For example, if a device is 80% full
* and the pool is 20% full then we should
* reduce allocations by 60% on this device.
*
* mg_bias = (20 - 80) * 512K / 100 = -307K
*
* This reduces allocations by 307K for this
* iteration.
*/
mg->mg_bias = ((cu - vu) *
(int64_t)mg->mg_aliquot) / (1024 * 4);
(int64_t)mg->mg_aliquot) / 100;
}

if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
Expand Down Expand Up @@ -1488,7 +1539,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);

if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);

if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
error = ENOENT;
Expand Down
7 changes: 7 additions & 0 deletions usr/src/uts/common/fs/zfs/spa_misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
*/

#include <sys/zfs_context.h>
Expand Down Expand Up @@ -1670,3 +1671,9 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)

return (0);
}

/*
 * Report whether debug messages are enabled for this pool by returning
 * the pool's spa_debug flag.
 */
boolean_t
spa_debug_enabled(spa_t *spa)
{
	boolean_t enabled = spa->spa_debug;

	return (enabled);
}
3 changes: 3 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/metaslab.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_H
Expand Down Expand Up @@ -47,6 +48,8 @@ extern void metaslab_sync_reassess(metaslab_group_t *mg);
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_GANG_AVOID 0x8

extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
Expand Down
2 changes: 2 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
* Copyright (c) 2011 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_IMPL_H
Expand Down Expand Up @@ -52,6 +53,7 @@ struct metaslab_group {
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
uint64_t mg_bonus_area;
uint64_t mg_alloc_failures;
int64_t mg_bias;
int64_t mg_activation_count;
metaslab_class_t *mg_class;
Expand Down
8 changes: 8 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
*/

#ifndef _SYS_SPA_H
Expand Down Expand Up @@ -697,6 +698,13 @@ _NOTE(CONSTCOND) } while (0)
#define dprintf_bp(bp, fmt, ...)
#endif

extern boolean_t spa_debug_enabled(spa_t *spa);
/*
 * Emit a debug message through zfs_dbgmsg() only when debugging is enabled
 * for the given pool (see spa_debug_enabled()).
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement. A bare { } block followed by the caller's ';' would break
 * an unbraced "if (...) spa_dbgmsg(...); else ..." construct.
 */
#define	spa_dbgmsg(spa, ...)			\
do {						\
	if (spa_debug_enabled(spa))		\
		zfs_dbgmsg(__VA_ARGS__);	\
_NOTE(CONSTCOND) } while (0)

extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */

#ifdef __cplusplus
Expand Down
2 changes: 2 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
*/

#ifndef _SYS_SPA_IMPL_H
Expand Down Expand Up @@ -196,6 +197,7 @@ struct spa {
kcondvar_t spa_suspend_cv; /* notification of resume */
uint8_t spa_suspended; /* pool is suspended */
uint8_t spa_claiming; /* pool is doing zil_claim() */
boolean_t spa_debug; /* debug enabled? */
boolean_t spa_is_root; /* pool is root */
int spa_minref; /* num refs when first opened */
int spa_mode; /* FREAD | FWRITE */
Expand Down
22 changes: 21 additions & 1 deletion usr/src/uts/common/fs/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 by Delphix. All rights reserved.
*/

#include <sys/zfs_context.h>
Expand Down Expand Up @@ -78,6 +79,7 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
extern int zfs_mg_alloc_failures;

/*
* An allocating zio is one that either currently has the DVA allocate
Expand Down Expand Up @@ -158,6 +160,12 @@ zio_init(void)
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
}

/*
* The zio write taskqs have 1 thread per cpu, so allow 1/2 of the taskqs
* to fail 3 times per txg, or 8 failures in total, whichever is greater.
*/
zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);

zio_inject_init();
}

Expand Down Expand Up @@ -2114,6 +2122,7 @@ zio_dva_allocate(zio_t *zio)
metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = zio->io_bp;
int error;
int flags = 0;

if (zio->io_gang_leader == NULL) {
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
Expand All @@ -2126,10 +2135,21 @@ zio_dva_allocate(zio_t *zio)
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

/*
* The dump device does not support gang blocks so allocation on
* behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
* the "fast" gang feature.
*/
flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
METASLAB_GANG_CHILD : 0;
error = metaslab_alloc(spa, mc, zio->io_size, bp,
zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
zio->io_prop.zp_copies, zio->io_txg, NULL, flags);

if (error) {
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
error);
if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
return (zio_write_gang_block(zio));
zio->io_error = error;
Expand Down

0 comments on commit 09c9d37

Please sign in to comment.