diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h index 5f831a1f5604..944d1301beec 100644 --- a/include/sys/metaslab.h +++ b/include/sys/metaslab.h @@ -77,7 +77,7 @@ void metaslab_class_histogram_verify(metaslab_class_t *); uint64_t metaslab_class_fragmentation(metaslab_class_t *); uint64_t metaslab_class_expandable_space(metaslab_class_t *); -void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t, +void metaslab_class_space_update(metaslab_class_t *, int, int64_t, int64_t, int64_t, int64_t); uint64_t metaslab_class_get_alloc(metaslab_class_t *); uint64_t metaslab_class_get_space(metaslab_class_t *); diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h index c61d44ff1cc6..65848f6de071 100644 --- a/include/sys/metaslab_impl.h +++ b/include/sys/metaslab_impl.h @@ -76,10 +76,14 @@ struct metaslab_class { uint64_t mc_aliquotv[METASLAB_CLASS_ROTORS]; int mc_max_nrot; /* highest rotor with member */ uint64_t mc_alloc_groups; /* # of allocatable groups */ - uint64_t mc_alloc; /* total allocated space */ - uint64_t mc_deferred; /* total deferred frees */ - uint64_t mc_space; /* total space (alloc + free) */ - uint64_t mc_dspace; /* total deflated space */ + /* total allocated space */ + uint64_t mc_allocv[METASLAB_CLASS_ROTORS]; + /* total deferred frees */ + uint64_t mc_deferredv[METASLAB_CLASS_ROTORS]; + /* total space (alloc + free) */ + uint64_t mc_spacev[METASLAB_CLASS_ROTORS]; + /* total deflated space */ + uint64_t mc_dspacev[METASLAB_CLASS_ROTORS]; uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; }; diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 4f54b1707c54..b4aee4fa174a 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -94,7 +94,7 @@ extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); -extern void vdev_space_update(vdev_t *vd, +extern void vdev_space_update(vdev_t *vd, int nrot, int64_t alloc_delta, int64_t 
defer_delta, int64_t space_delta); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 82b16ff52e6a..f1aac464dcfb 100755 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2134,7 +2134,7 @@ arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - vdev_space_update(dev->l2ad_vdev, + vdev_space_update(dev->l2ad_vdev, -1, -l2hdr->b_asize, 0, 0); (void) refcount_remove_many(&dev->l2ad_alloc, @@ -6153,7 +6153,7 @@ l2arc_write_done(zio_t *zio) kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); - vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); + vdev_space_update(dev->l2ad_vdev, -1, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); @@ -6707,7 +6707,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); ARCSTAT_INCR(arcstat_l2_size, write_sz); ARCSTAT_INCR(arcstat_l2_asize, stats_size); - vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); + vdev_space_update(dev->l2ad_vdev, -1, stats_size, 0, 0); /* * Bump device hand to the device start if it is approaching the end. 
@@ -7054,7 +7054,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); - vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); + vdev_space_update(vd, -1, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); refcount_create(&adddev->l2ad_alloc); /* diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 64b96cd6c9c6..40062bcd1cb1 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -33,6 +33,7 @@ #include #include #include +#include #define WITH_DF_BLOCK_ALLOCATOR @@ -218,10 +219,12 @@ metaslab_class_destroy(metaslab_class_t *mc) for (i = 0; i < METASLAB_CLASS_ROTORS; i++) ASSERT(mc->mc_rotorv[i] == NULL); - ASSERT(mc->mc_alloc == 0); - ASSERT(mc->mc_deferred == 0); - ASSERT(mc->mc_space == 0); - ASSERT(mc->mc_dspace == 0); + for (i = 0; i < METASLAB_CLASS_ROTORS; i++) { + ASSERT(mc->mc_allocv[i] == 0); + ASSERT(mc->mc_deferredv[i] == 0); + ASSERT(mc->mc_spacev[i] == 0); + ASSERT(mc->mc_dspacev[i] == 0); + } kmem_free(mc, sizeof (metaslab_class_t)); } @@ -256,37 +259,110 @@ metaslab_class_validate(metaslab_class_t *mc) } void -metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, +metaslab_class_space_update(metaslab_class_t *mc, int nrot, + int64_t alloc_delta, int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { - atomic_add_64(&mc->mc_alloc, alloc_delta); - atomic_add_64(&mc->mc_deferred, defer_delta); - atomic_add_64(&mc->mc_space, space_delta); - atomic_add_64(&mc->mc_dspace, dspace_delta); + ASSERT(nrot >= 0 && nrot < METASLAB_CLASS_ROTORS); + atomic_add_64(&mc->mc_allocv[nrot], alloc_delta); + atomic_add_64(&mc->mc_deferredv[nrot], defer_delta); + atomic_add_64(&mc->mc_spacev[nrot], space_delta); + atomic_add_64(&mc->mc_dspacev[nrot], dspace_delta); } uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { - return (mc->mc_alloc); + uint64_t total_alloc = 0; + int i; + + for (i = 0; i < METASLAB_CLASS_ROTORS; 
i++) + total_alloc += mc->mc_allocv[i]; + + return (total_alloc); } uint64_t metaslab_class_get_deferred(metaslab_class_t *mc) { - return (mc->mc_deferred); + uint64_t total_deferred = 0; + int i; + + for (i = 0; i < METASLAB_CLASS_ROTORS; i++) + total_deferred += mc->mc_deferredv[i]; + + return (total_deferred); } uint64_t metaslab_class_get_space(metaslab_class_t *mc) { - return (mc->mc_space); + uint64_t total_space = 0; + int i; + + for (i = 0; i < METASLAB_CLASS_ROTORS; i++) + total_space += mc->mc_spacev[i]; + + return (total_space); } uint64_t metaslab_class_get_dspace(metaslab_class_t *mc) { - return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); + uint64_t total_dspace_adj = 0; + uint64_t total_dspace = 0; + int i; + uint64_t max_ratio = 1; /* 1 to avoid division by 0 */ + + /* + * When we have a vector of rotors, we (artificially) adjust + * the total dspace returned to reflect the fill fraction of + * the most filled rotor. This is because the dspace value + * returned is used to determine if new writes can be made to + * the pool, and we do not want writes to continue if one of + * the vectors has gotten full. + * + * If there is only one component of the vector, we'll return + * the usual value. + */ + + /* Counting in per-mille for the moment... */ + + for (i = 0; i < METASLAB_CLASS_ROTORS; i++) { + uint64_t ratio = + (1000 * mc->mc_allocv[i]) / (mc->mc_spacev[i] + 1); + if (ratio > max_ratio) + max_ratio = ratio; + } + for (i = 0; i < METASLAB_CLASS_ROTORS; i++) { + uint64_t dspace = + (spa_deflate(mc->mc_spa) ? + mc->mc_dspacev[i] : mc->mc_spacev[i]); + uint64_t ratio = + (1000 * mc->mc_allocv[i]) / (mc->mc_spacev[i] + 1); + total_dspace_adj += (dspace * ratio) / max_ratio; + total_dspace += dspace; + } + + /* + * When max_ratio is small (we have a *lot* of free space), + * then the values will fluctuate considerably. But that does not + * matter, since what matters are the values when little space + * is free.
+ * + * However, the value is (luckily) also the value given to the + * user in e.g. df(1), so it would be nice to be accurate. Below + * 25 % we return the normal value, and above 75 % the + * adjusted. In between we give a sliding value. + */ + + if (max_ratio < 250) + return (total_dspace); + if (max_ratio > 750) + return (total_dspace_adj); + + return (total_dspace_adj * (max_ratio - 250) + + total_dspace * (750 - max_ratio)) / 500; } void @@ -1354,7 +1430,9 @@ metaslab_fini(metaslab_t *msp) mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); - vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm), + ASSERT(mg->mg_nrot != -1); + vdev_space_update(mg->mg_vd, mg->mg_nrot, + -space_map_allocated(msp->ms_sm), 0, -msp->ms_size); space_map_close(msp->ms_sm); @@ -2014,7 +2092,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) &msp->ms_lock); } - vdev_space_update(vd, 0, 0, msp->ms_size); + /* Decide which rotor of vector to place in. */ + mg->mg_nrot = 0; + if (!mg->mg_vd->vdev_nonrot) + mg->mg_nrot = 1; + + vdev_space_update(vd, mg->mg_nrot, 0, 0, msp->ms_size); } freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK]; @@ -2024,7 +2107,8 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) defer_delta = range_tree_space(*freed_tree) - range_tree_space(*defer_tree); - vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); + vdev_space_update(vd, mg->mg_nrot, + alloc_delta + defer_delta, defer_delta, 0); ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); @@ -2413,7 +2497,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vs_free = vs->vs_space - vs->vs_alloc; - int64_t mc_free = mc->mc_space - mc->mc_alloc; + int64_t mc_free = mc->mc_spacev[mg->mg_nrot] - + mc->mc_allocv[mg->mg_nrot]; int64_t ratio; /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index
75f6e5ce11ae..7a70c01ebcdc 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -3098,7 +3098,8 @@ vdev_stat_update(zio_t *zio, uint64_t psize) * and the root vdev. */ void -vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, +vdev_space_update(vdev_t *vd, int nrot, + int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { int64_t dspace_delta = space_delta; @@ -3138,7 +3139,7 @@ vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, ASSERT(rvd == vd->vdev_parent); ASSERT(vd->vdev_ms_count != 0); - metaslab_class_space_update(mc, + metaslab_class_space_update(mc, nrot, alloc_delta, defer_delta, space_delta, dspace_delta); } }