Skip to content

Commit

Permalink
Illumos 4976-4984 - metaslab improvements
Browse files Browse the repository at this point in the history
4976 zfs should only avoid writing to a failing non-redundant top-level vdev
4978 ztest fails in get_metaslab_refcount()
4979 extend free space histogram to device and pool
4980 metaslabs should have a fragmentation metric
4981 remove fragmented ops vector from block allocator
4982 space_map object should proactively upgrade when feature is enabled
4983 need to collect metaslab information via mdb
4984 device selection should use fragmentation metric
Reviewed by: Matthew Ahrens <[email protected]>
Reviewed by: Adam Leventhal <[email protected]>
Reviewed by: Christopher Siden <[email protected]>
Approved by: Garrett D'Amore <[email protected]>

References:
  https://www.illumos.org/issues/4976
  https://www.illumos.org/issues/4978
  https://www.illumos.org/issues/4979
  https://www.illumos.org/issues/4980
  https://www.illumos.org/issues/4981
  https://www.illumos.org/issues/4982
  https://www.illumos.org/issues/4983
  https://www.illumos.org/issues/4984
  illumos/illumos-gate@2e4c998

Notes:
    The "zdb -M" option has been repurposed to display the new metaslab
    group fragmentation metric. The maximum number of in-flight I/Os,
    which "zdb -M" used to control, is now set with the new "zdb -I" option.

    The new fragmentation metric is derived from the space map histogram
    which has been rolled up to the vdev and pool level and is presented
    to the user via "zpool list".

    Add a number of module parameters related to the new metaslab weighting
    logic.

Ported by: Tim Chase <[email protected]>
Signed-off-by: Brian Behlendorf <[email protected]>
Closes openzfs#2595
  • Loading branch information
grwilson authored and behlendorf committed Aug 18, 2014
1 parent f67d709 commit f3a7f66
Show file tree
Hide file tree
Showing 18 changed files with 836 additions and 243 deletions.
74 changes: 60 additions & 14 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,11 @@ static void
usage(void)
{
(void) fprintf(stderr,
"Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
"[-U config] [-M inflight I/Os] poolname [object...]\n"
"Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
"[-U config] [-I inflight I/Os] poolname [object...]\n"
" %s [-divPA] [-e -p path...] [-U config] dataset "
"[object...]\n"
" %s -m [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
" %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
"poolname [vdev [metaslab...]]\n"
" %s -R [-A] [-e [-p path...]] poolname "
"vdev:offset:size[:flags]\n"
Expand All @@ -137,6 +137,7 @@ usage(void)
(void) fprintf(stderr, " -h pool history\n");
(void) fprintf(stderr, " -b block statistics\n");
(void) fprintf(stderr, " -m metaslabs\n");
(void) fprintf(stderr, " -M metaslab groups\n");
(void) fprintf(stderr, " -c checksum all metadata (twice for "
"all data) blocks\n");
(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
Expand Down Expand Up @@ -165,7 +166,7 @@ usage(void)
(void) fprintf(stderr, " -P print numbers in parseable form\n");
(void) fprintf(stderr, " -t <txg> -- highest txg to use when "
"searching for uberblocks\n");
(void) fprintf(stderr, " -M <number of inflight I/Os> -- "
(void) fprintf(stderr, " -I <number of inflight I/Os> -- "
"specify the maximum number of checksumming I/Os "
"[default is 200]\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
Expand Down Expand Up @@ -547,7 +548,7 @@ get_metaslab_refcount(vdev_t *vd)
int refcount = 0;
int c, m;

if (vd->vdev_top == vd) {
if (vd->vdev_top == vd && !vd->vdev_removing) {
for (m = 0; m < vd->vdev_ms_count; m++) {
space_map_t *sm = vd->vdev_ms[m]->ms_sm;

Expand Down Expand Up @@ -685,9 +686,10 @@ dump_metaslab(metaslab_t *msp)
* The space map histogram represents free space in chunks
* of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
*/
(void) printf("\tOn-disk histogram:\n");
(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
(u_longlong_t)msp->ms_fragmentation);
dump_histogram(sm->sm_phys->smp_histogram,
SPACE_MAP_HISTOGRAM_SIZE(sm), sm->sm_shift);
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}

if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
Expand All @@ -711,6 +713,48 @@ print_vdev_metaslab_header(vdev_t *vd)
"---------------", "-------------");
}

/*
 * Print a fragmentation summary and histogram for every child of the
 * pool's root vdev whose metaslab group belongs to the normal class,
 * followed by the same summary for the normal class as a whole.  A
 * fragmentation value of ZFS_FRAG_INVALID is printed as "-".
 */
static void
dump_metaslab_groups(spa_t *spa)
{
	vdev_t *root = spa->spa_root_vdev;
	metaslab_class_t *normal = spa_normal_class(spa);
	uint64_t pool_frag;
	int child;

	metaslab_class_histogram_verify(normal);

	for (child = 0; child < root->vdev_children; child++) {
		vdev_t *top = root->vdev_child[child];
		metaslab_group_t *group = top->vdev_mg;

		/* Groups outside the normal class are not reported. */
		if (group->mg_class == normal) {
			metaslab_group_histogram_verify(group);

			/* Store the freshly computed value before printing. */
			group->mg_fragmentation =
			    metaslab_group_fragmentation(group);

			(void) printf("\tvdev %10llu\t\t"
			    "metaslabs%5llu\t\tfragmentation",
			    (u_longlong_t)top->vdev_id,
			    (u_longlong_t)top->vdev_ms_count);

			if (group->mg_fragmentation != ZFS_FRAG_INVALID) {
				(void) printf("%3llu%%\n",
				    (u_longlong_t)group->mg_fragmentation);
			} else {
				(void) printf("%3s\n", "-");
			}

			dump_histogram(group->mg_histogram,
			    RANGE_TREE_HISTOGRAM_SIZE, 0);
		}
	}

	/* Class-wide summary, printed after all per-vdev entries. */
	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
	pool_frag = metaslab_class_fragmentation(normal);
	if (pool_frag != ZFS_FRAG_INVALID) {
		(void) printf("\t%3llu%%\n", (u_longlong_t)pool_frag);
	} else {
		(void) printf("\t%3s\n", "-");
	}
	dump_histogram(normal->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}

static void
dump_metaslabs(spa_t *spa)
{
Expand Down Expand Up @@ -2381,8 +2425,7 @@ zdb_leak(void *arg, uint64_t start, uint64_t size)
}

static metaslab_ops_t zdb_metaslab_ops = {
NULL, /* alloc */
NULL /* fragmented */
NULL /* alloc */
};

static void
Expand Down Expand Up @@ -2874,6 +2917,8 @@ dump_zpool(spa_t *spa)

if (dump_opt['d'] > 2 || dump_opt['m'])
dump_metaslabs(spa);
if (dump_opt['M'])
dump_metaslab_groups(spa);

if (dump_opt['d'] || dump_opt['i']) {
dump_dir(dp->dp_meta_objset);
Expand Down Expand Up @@ -3363,7 +3408,7 @@ main(int argc, char **argv)
int flags = ZFS_IMPORT_MISSING_LOG;
int rewind = ZPOOL_NEVER_REWIND;
char *spa_config_path_env;
const char *opts = "bcdhilmM:suCDRSAFLVXevp:t:U:P";
const char *opts = "bcdhilmMI:suCDRSAFLXevp:t:U:P";

(void) setrlimit(RLIMIT_NOFILE, &rl);
(void) enable_extended_FILE_stdio(-1, -1);
Expand Down Expand Up @@ -3392,6 +3437,7 @@ main(int argc, char **argv)
case 'u':
case 'C':
case 'D':
case 'M':
case 'R':
case 'S':
dump_opt[c]++;
Expand All @@ -3408,10 +3454,7 @@ main(int argc, char **argv)
case 'V':
flags = ZFS_IMPORT_VERBATIM;
break;
case 'v':
verbose++;
break;
case 'M':
case 'I':
max_inflight = strtoull(optarg, NULL, 0);
if (max_inflight == 0) {
(void) fprintf(stderr, "maximum number "
Expand Down Expand Up @@ -3446,6 +3489,9 @@ main(int argc, char **argv)
case 'U':
spa_config_path = optarg;
break;
case 'v':
verbose++;
break;
default:
usage();
break;
Expand Down
18 changes: 13 additions & 5 deletions cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -2998,10 +2998,16 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted)
boolean_t fixed;
size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL);

zfs_nicenum(value, propval, sizeof (propval));

if (prop == ZPOOL_PROP_EXPANDSZ && value == 0)
(void) strlcpy(propval, "-", sizeof (propval));
else if (prop == ZPOOL_PROP_FRAGMENTATION && value == ZFS_FRAG_INVALID)
(void) strlcpy(propval, "-", sizeof (propval));
else if (prop == ZPOOL_PROP_FRAGMENTATION)
(void) snprintf(propval, sizeof (propval), "%llu%%",
(unsigned long long)value);
else
zfs_nicenum(value, propval, sizeof (propval));

if (scripted)
(void) printf("\t%s", propval);
Expand Down Expand Up @@ -3034,16 +3040,18 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
/* only toplevel vdevs have capacity stats */
if (vs->vs_space == 0) {
if (scripted)
(void) printf("\t-\t-\t-");
(void) printf("\t-\t-\t-\t-");
else
(void) printf(" - - -");
(void) printf(" - - - -");
} else {
print_one_column(ZPOOL_PROP_SIZE, vs->vs_space,
scripted);
print_one_column(ZPOOL_PROP_CAPACITY, vs->vs_alloc,
scripted);
print_one_column(ZPOOL_PROP_FREE,
vs->vs_space - vs->vs_alloc, scripted);
print_one_column(ZPOOL_PROP_FRAGMENTATION,
vs->vs_fragmentation, scripted);
}
print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize,
scripted);
Expand Down Expand Up @@ -3128,8 +3136,8 @@ zpool_do_list(int argc, char **argv)
int ret = 0;
list_cbdata_t cb = { 0 };
static char default_props[] =
"name,size,allocated,free,capacity,dedupratio,"
"health,altroot";
"name,size,allocated,free,fragmentation,capacity,"
"dedupratio,health,altroot";
char *props = default_props;
unsigned long interval = 0, count = 0;
zpool_list_t *list;
Expand Down
9 changes: 9 additions & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ typedef enum {
ZPOOL_PROP_COMMENT,
ZPOOL_PROP_EXPANDSZ,
ZPOOL_PROP_FREEING,
ZPOOL_PROP_FRAGMENTATION,
ZPOOL_PROP_LEAKED,
ZPOOL_NUM_PROPS
} zpool_prop_t;
Expand Down Expand Up @@ -599,6 +600,13 @@ typedef struct zpool_rewind_policy {
*/
#define SPA_MINDEVSIZE (64ULL << 20)

/*
 * Sentinel fragmentation value meaning that no fragmentation metric is
 * available.  It is used when the fragmentation has not yet been
 * calculated, which can happen because the space maps have not been
 * upgraded or the histogram feature is not enabled.  Consumers such as
 * zdb and "zpool list" display this value as "-" rather than a percentage.
 */
#define ZFS_FRAG_INVALID UINT64_MAX

/*
* The location of the pool configuration repository, shared between kernel and
* userland.
Expand Down Expand Up @@ -747,6 +755,7 @@ typedef struct vdev_stat {
uint64_t vs_self_healed; /* self-healed bytes */
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
uint64_t vs_fragmentation; /* device fragmentation */
} vdev_stat_t;

/*
Expand Down
71 changes: 38 additions & 33 deletions include/sys/metaslab.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_H
Expand All @@ -38,23 +38,22 @@ extern "C" {

typedef struct metaslab_ops {
uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size);
boolean_t (*msop_fragmented)(metaslab_t *msp);
} metaslab_ops_t;

extern metaslab_ops_t *zfs_metaslab_ops;

metaslab_t *metaslab_init(metaslab_group_t *mg, uint64_t id,
uint64_t object, uint64_t txg);
void metaslab_fini(metaslab_t *msp);
metaslab_t *metaslab_init(metaslab_group_t *, uint64_t,
uint64_t, uint64_t);
void metaslab_fini(metaslab_t *);

void metaslab_load_wait(metaslab_t *msp);
int metaslab_load(metaslab_t *msp);
void metaslab_unload(metaslab_t *msp);
void metaslab_load_wait(metaslab_t *);
int metaslab_load(metaslab_t *);
void metaslab_unload(metaslab_t *);

void metaslab_sync(metaslab_t *msp, uint64_t txg);
void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
void metaslab_sync_reassess(metaslab_group_t *mg);
uint64_t metaslab_block_maxsize(metaslab_t *msp);
void metaslab_sync(metaslab_t *, uint64_t);
void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);

#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
Expand All @@ -63,30 +62,36 @@ uint64_t metaslab_block_maxsize(metaslab_t *msp);
#define METASLAB_GANG_AVOID 0x8
#define METASLAB_FASTWRITE 0x10

int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now);
int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
void metaslab_check_free(spa_t *spa, const blkptr_t *bp);
void metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp);
void metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp);
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);

metaslab_class_t *metaslab_class_create(spa_t *spa, metaslab_ops_t *ops);
void metaslab_class_destroy(metaslab_class_t *mc);
int metaslab_class_validate(metaslab_class_t *mc);
metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
void metaslab_class_destroy(metaslab_class_t *);
int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);

void metaslab_class_space_update(metaslab_class_t *mc,
int64_t alloc_delta, int64_t defer_delta,
int64_t space_delta, int64_t dspace_delta);
uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
uint64_t metaslab_class_get_space(metaslab_class_t *mc);
uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
uint64_t metaslab_class_get_alloc(metaslab_class_t *);
uint64_t metaslab_class_get_space(metaslab_class_t *);
uint64_t metaslab_class_get_dspace(metaslab_class_t *);
uint64_t metaslab_class_get_deferred(metaslab_class_t *);

metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd);
void metaslab_group_destroy(metaslab_group_t *mg);
void metaslab_group_activate(metaslab_group_t *mg);
void metaslab_group_passivate(metaslab_group_t *mg);
metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);

#ifdef __cplusplus
}
Expand Down
Loading

0 comments on commit f3a7f66

Please sign in to comment.