From 20ad617df6abaea9f1f4ddf97633764fa1fec04d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 31 May 2013 12:07:59 -0700 Subject: [PATCH] Improve N-way mirror performance The read bandwidth of an N-way mirror can be increased by 50%, and the IOPs by 10%, by more carefully selecting the preferred leaf vdev. The existing algorithm selects a preferred leaf vdev based on offset of the zio request modulo the number of members in the mirror. It assumes the drives are of equal performance and that spreading the requests randomly over both drives will be sufficient to saturate them. In practice this results in the leaf vdevs being under utilized. Utilization can be improved by preferentially selecting the leaf vdev with the least pending IO. This prevents leaf vdevs from being starved and compensates for performance differences between disks in the mirror. Faster vdevs will be sent more work and the mirror performance will not be limited by the slowest drive. In the common case where all the pending queues are full and there is no single least busy leaf vdev a batching strategy is employed. Of the N least busy vdevs one is selected with equal probability to be the preferred vdev for T milliseconds. Compared to randomly selecting a vdev to break the tie batching the requests greatly improves the odds of merging the requests in the Linux elevator. The testing results show a significant performance improvement for all four workloads tested. The workloads were generated using the fio benchmark and are as follows. 1) 1MB sequential reads from 16 threads to 16 files (MB/s). 2) 4KB sequential reads from 16 threads to 16 files (MB/s). 3) 1MB random reads from 16 threads to 16 files (IOP/s). 4) 4KB random reads from 16 threads to 16 files (IOP/s). 
| Pristine | With 1461 | | Sequential Random | Sequential Random | | 1MB 4KB 1MB 4KB | 1MB 4KB 1MB 4KB | | MB/s MB/s IO/s IO/s | MB/s MB/s IO/s IO/s | ---------------+-----------------------+------------------------+ 2 Striped | 226 243 11 304 | 222 255 11 299 | 2 2-Way Mirror | 302 324 16 534 | 433 448 23 571 | 2 3-Way Mirror | 429 458 24 714 | 648 648 41 808 | 2 4-Way Mirror | 562 601 36 849 | 816 828 82 926 | Signed-off-by: Brian Behlendorf Issue #1461 --- lib/libspl/include/sys/time.h | 4 ++ module/zfs/vdev_mirror.c | 70 +++++++++++++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 3 deletions(-) diff --git a/lib/libspl/include/sys/time.h b/lib/libspl/include/sys/time.h index 0cbbd928dbce..681f5e72cc98 100644 --- a/lib/libspl/include/sys/time.h +++ b/lib/libspl/include/sys/time.h @@ -46,6 +46,10 @@ #define NANOSEC 1000000000 #endif +#ifndef NSEC_PER_MSEC +#define NSEC_PER_MSEC 1000000L +#endif + #ifndef NSEC_PER_USEC #define NSEC_PER_USEC 1000L #endif diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c index a2671ca81a37..b775ec9217d3 100644 --- a/module/zfs/vdev_mirror.c +++ b/module/zfs/vdev_mirror.c @@ -41,6 +41,7 @@ typedef struct mirror_child { vdev_t *mc_vd; uint64_t mc_offset; int mc_error; + int mc_pending; uint8_t mc_tried; uint8_t mc_skipped; uint8_t mc_speculative; @@ -54,7 +55,17 @@ typedef struct mirror_map { mirror_child_t mm_child[1]; } mirror_map_t; -int vdev_mirror_shift = 21; +/* + * When the children are equally busy queue incoming request to a single + * child for N milliseconds. This is done to maximum the likelihood that + * the Linux elevator will be able to merge requests while it is plugged. + * + * For rotational disks the Linux elevator will plug for 10ms which is + * why zfs_vdev_mirror_switch_ms is set to 10ms by default. For non- + * rotational disks the elevator will not plug but 10ms is still a small + * enough value the requests will get spread over all the children. 
+ */ +int zfs_vdev_mirror_switch_ms = 10; static void vdev_mirror_map_free(zio_t *zio) @@ -69,6 +80,19 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = { zio_vsd_default_cksum_report }; +static int +vdev_mirror_pending(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + int pending; + + mutex_enter(&vq->vq_lock); + pending = avl_numnodes(&vq->vq_pending_tree); + mutex_exit(&vq->vq_lock); + + return (pending); +} + static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { @@ -108,20 +132,55 @@ vdev_mirror_map_alloc(zio_t *zio) mc->mc_offset = DVA_GET_OFFSET(&dva[c]); } } else { + int lowest_pending = INT_MAX; + int lowest_nr = 0; + c = vd->vdev_children; mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE); mm->mm_children = c; mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops || vd->vdev_ops == &vdev_spare_ops); - mm->mm_preferred = mm->mm_replacing ? 0 : - (zio->io_offset >> vdev_mirror_shift) % c; + mm->mm_preferred = 0; mm->mm_root = B_FALSE; for (c = 0; c < mm->mm_children; c++) { mc = &mm->mm_child[c]; mc->mc_vd = vd->vdev_child[c]; mc->mc_offset = zio->io_offset; + + if (mm->mm_replacing) + continue; + + if (!vdev_readable(mc->mc_vd)) { + mc->mc_error = ENXIO; + mc->mc_tried = 1; + mc->mc_skipped = 1; + mc->mc_pending = INT_MAX; + continue; + } + + mc->mc_pending = vdev_mirror_pending(mc->mc_vd); + if (mc->mc_pending < lowest_pending) { + lowest_pending = mc->mc_pending; + lowest_nr = 1; + } else if (mc->mc_pending == lowest_pending) { + lowest_nr++; + } + } + + d = gethrtime() / (NSEC_PER_MSEC * zfs_vdev_mirror_switch_ms); + d = (d % lowest_nr) + 1; + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + + if (mm->mm_child[c].mc_pending == lowest_pending) { + if (--d == 0) { + mm->mm_preferred = c; + break; + } + } } } @@ -492,3 +551,8 @@ vdev_ops_t vdev_spare_ops = { VDEV_TYPE_SPARE, /* name of this vdev type */ B_FALSE /* not a leaf vdev */ }; + +#if defined(_KERNEL) && defined(HAVE_SPL) 
+module_param(zfs_vdev_mirror_switch_ms, int, 0644); +MODULE_PARM_DESC(zfs_vdev_mirror_switch_ms, "Switch mirrors every N msecs"); +#endif