Skip to content

Commit

Permalink
Improve N-way mirror performance
Browse files Browse the repository at this point in the history
The read bandwidth of an N-way mirror can be increased by 50%,
and the IOPs by 10%, by more carefully selecting the preferred
leaf vdev.

The existing algorithm selects a preferred leaf vdev based on
offset of the zio request modulo the number of members in the
mirror.  It assumes the drives are of equal performance and
that spreading the requests randomly over both drives will be
sufficient to saturate them.  In practice this results in the
leaf vdevs being under utilized.

Utilization can be improved by preferentially selecting the leaf
vdev with the least pending IO.  This prevents leaf vdevs from
being starved and compensates for performance differences between
disks in the mirror.  Faster vdevs will be sent more work and
the mirror performance will not be limited by the slowest drive.

In the common case where all the pending queues are full and there
is no single least busy leaf vdev a batching strategy is employed.
Of the N least busy vdevs one is selected with equal probability
to be the preferred vdev for T microseconds.  Compared to randomly
selecting a vdev to break the tie batching the requests greatly
improves the odds of merging the requests in the Linux elevator.

The testing results show a significant performance improvement
for all four workloads tested.  The workloads were generated
using the fio benchmark and are as follows.

1) 1MB sequential reads from 16 threads to 16 files (MB/s).
2) 4KB sequential reads from 16 threads to 16 files (MB/s).
3) 1MB random reads from 16 threads to 16 files (IOP/s).
4) 4KB random reads from 16 threads to 16 files (IOP/s).

               | Pristine              |  With 1461             |
               | Sequential  Random    |  Sequential  Random    |
               | 1MB  4KB    1MB  4KB  |  1MB  4KB    1MB  4KB  |
               | MB/s MB/s   IO/s IO/s |  MB/s MB/s   IO/s IO/s |
---------------+-----------------------+------------------------+
2 Striped      | 226  243     11  304  |  222  255     11  299  |
2 2-Way Mirror | 302  324     16  534  |  433  448     23  571  |
2 3-Way Mirror | 429  458     24  714  |  648  648     41  808  |
2 4-Way Mirror | 562  601     36  849  |  816  828     82  926  |

Signed-off-by: Brian Behlendorf <[email protected]>
Closes #1461
  • Loading branch information
behlendorf committed Jul 11, 2013
1 parent 92334b1 commit 556011d
Showing 1 changed file with 73 additions and 3 deletions.
76 changes: 73 additions & 3 deletions module/zfs/vdev_mirror.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ typedef struct mirror_child {
vdev_t *mc_vd;
uint64_t mc_offset;
int mc_error;
int mc_pending;
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
Expand All @@ -54,7 +55,23 @@ typedef struct mirror_map {
mirror_child_t mm_child[1];
} mirror_map_t;

int vdev_mirror_shift = 21;
/*
* When the children are equally busy queue incoming requests to a single
* child for N microseconds. This is done to maximize the likelihood that
* the Linux elevator will be able to merge requests while it is plugged.
* Otherwise, requests are queued to the least busy device.
*
* For rotational disks the Linux elevator will plug for 10ms which is
* why zfs_vdev_mirror_switch_us is set to 10ms by default. For non-
* rotational disks the elevator will not plug, but 10ms is still a small
* enough value that the requests will get spread over all the children.
*
* For fast SSDs it may make sense to decrease zfs_vdev_mirror_switch_us
* significantly to bound the worst case latencies. It would probably be
* ideal to calculate a decaying average of the last observed latencies and
* use that to dynamically adjust the zfs_vdev_mirror_switch_us time.
*/
int zfs_vdev_mirror_switch_us = 10000;

static void
vdev_mirror_map_free(zio_t *zio)
Expand All @@ -69,6 +86,19 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
zio_vsd_default_cksum_report
};

/*
 * Return the number of I/Os currently outstanding on the given leaf
 * vdev, i.e. the size of its pending AVL tree.  The queue lock is
 * taken so the count is a consistent snapshot; callers use it to
 * pick the least busy mirror child.
 */
static int
vdev_mirror_pending(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;
	int nr_pending;

	mutex_enter(&vq->vq_lock);
	nr_pending = avl_numnodes(&vq->vq_pending_tree);
	mutex_exit(&vq->vq_lock);

	return (nr_pending);
}

static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
Expand Down Expand Up @@ -108,20 +138,55 @@ vdev_mirror_map_alloc(zio_t *zio)
mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
}
} else {
int lowest_pending = INT_MAX;
int lowest_nr = 1;

c = vd->vdev_children;

mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE);
mm->mm_children = c;
mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
mm->mm_preferred = mm->mm_replacing ? 0 :
(zio->io_offset >> vdev_mirror_shift) % c;
mm->mm_preferred = 0;
mm->mm_root = B_FALSE;

for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
mc->mc_offset = zio->io_offset;

if (mm->mm_replacing)
continue;

if (!vdev_readable(mc->mc_vd)) {
mc->mc_error = ENXIO;
mc->mc_tried = 1;
mc->mc_skipped = 1;
mc->mc_pending = INT_MAX;
continue;
}

mc->mc_pending = vdev_mirror_pending(mc->mc_vd);
if (mc->mc_pending < lowest_pending) {
lowest_pending = mc->mc_pending;
lowest_nr = 1;
} else if (mc->mc_pending == lowest_pending) {
lowest_nr++;
}
}

d = gethrtime() / (NSEC_PER_USEC * zfs_vdev_mirror_switch_us);
d = (d % lowest_nr) + 1;

for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];

if (mm->mm_child[c].mc_pending == lowest_pending) {
if (--d == 0) {
mm->mm_preferred = c;
break;
}
}
}
}

Expand Down Expand Up @@ -492,3 +557,8 @@ vdev_ops_t vdev_spare_ops = {
VDEV_TYPE_SPARE, /* name of this vdev type */
B_FALSE /* not a leaf vdev */
};

#if defined(_KERNEL) && defined(HAVE_SPL)
/*
 * Expose the mirror batching window (zfs_vdev_mirror_switch_us) as a
 * runtime-tunable module parameter, world-readable and root-writable.
 */
module_param(zfs_vdev_mirror_switch_us, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_switch_us, "Switch mirrors every N usecs");
#endif

0 comments on commit 556011d

Please sign in to comment.