diff --git a/.nogitrelease b/.nogitrelease
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 8c949e7671a0..bfcc01497dbb 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -707,6 +707,7 @@ typedef struct vdev_stat {
 	uint64_t	vs_self_healed;		/* self-healed bytes	*/
 	uint64_t	vs_scan_removing;	/* removing?	*/
 	uint64_t	vs_scan_processed;	/* scan processed bytes	*/
+	uint64_t	vs_request_time_average; /* avg. request time <<16 */
 } vdev_stat_t;
 
 /*
diff --git a/include/sys/vdev.h b/include/sys/vdev.h
index f49086a4776c..49c8fcb6fb02 100644
--- a/include/sys/vdev.h
+++ b/include/sys/vdev.h
@@ -69,6 +69,7 @@ extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
 extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
+extern uint64_t vdev_pending_queued(vdev_t *vd);
 
 extern void vdev_hold(vdev_t *);
 extern void vdev_rele(vdev_t *);
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
index c44e4f67f068..4698a3d02f1f 100644
--- a/module/zfs/vdev.c
+++ b/module/zfs/vdev.c
@@ -1948,6 +1948,31 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
 	return (needed);
 }
 
+/*
+ * Estimate the cost of placing one more request on this vdev: the smoothed
+ * request time multiplied by the number of pending I/Os plus the new one.
+ * The result is scaled (request time << 8) and is only meaningful when
+ * compared against the estimate of another vdev.
+ */
+uint64_t
+vdev_pending_queued(vdev_t *vd)
+{
+	vdev_queue_t *vq = &vd->vdev_queue;
+	uint64_t pending;
+	uint64_t average;
+
+	mutex_enter(&vq->vq_lock);
+	pending = avl_numnodes(&vq->vq_pending_tree);
+	mutex_exit(&vq->vq_lock);
+
+	/* Writers hold vdev_stat_lock; take it for a consistent read. */
+	mutex_enter(&vd->vdev_stat_lock);
+	average = vd->vdev_stat.vs_request_time_average;
+	mutex_exit(&vd->vdev_stat_lock);
+
+	/* Count the request being placed in addition to those pending. */
+	return ((average >> 8) * (pending + 1));
+}
+
 void
 vdev_load(vdev_t *vd)
 {
@@ -2614,7 +2639,19 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
 
 		vs->vs_ops[type]++;
 		vs->vs_bytes[type] += psize;
 
+		if (zio->io_timestamp > 0) {
+			uint64_t elapsed = ddi_get_lbolt64() -
+			    zio->io_timestamp + 1;
+
+			/*
+			 * Exponentially weighted moving average with a
+			 * gain of 1/256; converges on (elapsed << 16).
+			 */
+			vs->vs_request_time_average += (elapsed << 8) -
+			    (vs->vs_request_time_average >> 8);
+		}
+
 		mutex_exit(&vd->vdev_stat_lock);
 		return;
 	}
diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c
index a2671ca81a37..3aec95042614 100644
--- a/module/zfs/vdev_mirror.c
+++ b/module/zfs/vdev_mirror.c
@@ -33,6 +33,12 @@
 #include <sys/zio.h>
 #include <sys/fs/zfs.h>
 
+/*
+ * When non-zero, reads are sent to the readable mirror child with the
+ * lowest vdev_pending_queued() estimate rather than the first candidate.
+ */
+int zfs_vdev_mirror_pending_balance = 0;
+
 /*
  * Virtual device vector for mirroring.
  */
@@ -221,7 +227,9 @@ vdev_mirror_child_select(zio_t *zio)
 	mirror_map_t *mm = zio->io_vsd;
 	mirror_child_t *mc;
 	uint64_t txg = zio->io_txg;
+	int pending_lowest_child = -1;
+	uint64_t pending_lowest_count = UINT64_MAX;
 	int i, c;
 
 	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
 
@@ -243,12 +251,38 @@ vdev_mirror_child_select(zio_t *zio)
 			continue;
 		}
-		if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
-			return (c);
+		if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+			uint64_t pending;
+
+			/* Balancing disabled: take the first usable child. */
+			if (!zfs_vdev_mirror_pending_balance)
+				return (c);
+
+			/* A zero estimate cannot be beaten; stop looking. */
+			pending = vdev_pending_queued(mc->mc_vd);
+			if (pending == 0)
+				return (c);
+
+			/* Lowest estimate wins; mm_preferred wins ties. */
+			if (pending < pending_lowest_count ||
+			    (pending == pending_lowest_count &&
+			    c == mm->mm_preferred)) {
+				pending_lowest_count = pending;
+				pending_lowest_child = c;
+			}
+			continue;
+		}
 		mc->mc_error = ESTALE;
 		mc->mc_skipped = 1;
 		mc->mc_speculative = 1;
 	}
 
+	/*
+	 * With balancing enabled, return the candidate with the lowest
+	 * estimate, if any child qualified.
+	 */
+	if (pending_lowest_child != -1)
+		return (pending_lowest_child);
+
 	/*
 	 * Every device is either missing or has this txg in its DTL.
 	 * Look for any child we haven't already tried before giving up.
@@ -492,3 +526,10 @@ vdev_ops_t vdev_spare_ops = {
 	VDEV_TYPE_SPARE,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_vdev_mirror_pending_balance, int, 0644);
+MODULE_PARM_DESC(zfs_vdev_mirror_pending_balance,
+    "Balance reads from mirror vdev based on member speed "
+    "and pending queue depth");
+#endif