From 921ef063ac4418eae89114c8b36f8cf08adb4546 Mon Sep 17 00:00:00 2001
From: Rob Norris <rob.norris@klarasystems.com>
Date: Thu, 9 May 2024 11:50:58 +1000
Subject: [PATCH] zio_flush: propagate flush errors to the ZIL

Since the beginning, ZFS' "flush" operation has always ignored
errors[1]. Write errors are captured and dealt with, but if a write
succeeds but the subsequent flush fails, the operation as a whole will
appear to succeed[2].

In the end-of-transaction uberblock+label write+flush ceremony, it's
very difficult for this situation to occur. Since all devices are
written to, typically the first write will succeed, the first flush will
fail unobserved, but then the second write will fail, and the entire
transaction is aborted. It's difficult to imagine a real-world scenario
where all the writes in that sequence could succeed even as the flushes
are failing (understanding that the OS is still seeing hardware problems
and taking devices offline).

In the ZIL however, it's another story. Since only the write response is
checked, if that write succeeds but the flush then fails, the ZIL will
believe that it succeeds, and zil_commit() (and thus fsync()) will
return success rather than the "correct" behaviour of falling back into
txg_wait_synced()[3].

This commit fixes this by adding a simple flag to zio_flush() to
indicate whether or not the caller wants to receive flush errors. This
flag is enabled for ZIL calls. The existing zio chaining inside the ZIL
and the flush handler zil_lwb_flush_vdevs_done() already has all the
necessary support to properly handle a flush failure and fail the entire
zio chain. This causes zil_commit() to correct fall back to
txg_wait_synced() rather than returning success prematurely.

1. The ZFS birth commit (illumos/illumos-gate@fa9e4066f0) had support
   for flushing devices with write caches with the DKIOCFLUSHWRITECACHE
   ioctl. No errors are checked. The comment in `zil_flush_vdevs()` from
   from the time shows the thinking:

   /*
    * Wait for all the flushes to complete.  Not all devices actually
    * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
    */

2. It's not entirely clear from the code history why this was acceptable
   for devices that _do_ have write caches. Our best guess is that this
   was an oversight: between the combination of hardware, pool topology
   and application behaviour required to hit this, it basically didn't
   come up.

3. Somewhat frustratingly, zil.c contains comments describing this exact
   behaviour, and further discussion in #12443 (September 2021). It
   appears that those involved saw the potential, but were looking at a
   different problem and so didn't have the context to recognise it for
   what it was.

Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
---
 include/sys/zio.h       |  2 +-
 module/zfs/vdev_label.c | 12 +++++++-----
 module/zfs/vdev_raidz.c |  6 +++---
 module/zfs/zil.c        | 19 ++-----------------
 module/zfs/zio.c        |  8 ++++----
 5 files changed, 17 insertions(+), 30 deletions(-)

diff --git a/include/sys/zio.h b/include/sys/zio.h
index 77c70b9b481c..c5a9fb6c8824 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -594,7 +594,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
 
 extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg,
     blkptr_t *new_bp, uint64_t size, boolean_t *slog);
-extern void zio_flush(zio_t *zio, vdev_t *vd);
+extern void zio_flush(zio_t *zio, vdev_t *vd, boolean_t propagate);
 extern void zio_shrink(zio_t *zio, uint64_t size);
 
 extern int zio_wait(zio_t *zio);
diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c
index ed592514fded..6f61d57b749d 100644
--- a/module/zfs/vdev_label.c
+++ b/module/zfs/vdev_label.c
@@ -1831,19 +1831,21 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
 
 	for (int v = 0; v < svdcount; v++) {
 		if (vdev_writeable(svd[v])) {
-			zio_flush(zio, svd[v]);
+			zio_flush(zio, svd[v], B_FALSE);
 		}
 	}
 	if (spa->spa_aux_sync_uber) {
 		spa->spa_aux_sync_uber = B_FALSE;
 		for (int v = 0; v < spa->spa_spares.sav_count; v++) {
 			if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) {
-				zio_flush(zio, spa->spa_spares.sav_vdevs[v]);
+				zio_flush(zio, spa->spa_spares.sav_vdevs[v],
+				    B_FALSE);
 			}
 		}
 		for (int v = 0; v < spa->spa_l2cache.sav_count; v++) {
 			if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) {
-				zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]);
+				zio_flush(zio, spa->spa_l2cache.sav_vdevs[v],
+				    B_FALSE);
 			}
 		}
 	}
@@ -1981,7 +1983,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
 	zio = zio_root(spa, NULL, NULL, flags);
 
 	for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
-		zio_flush(zio, vd);
+		zio_flush(zio, vd, B_FALSE);
 
 	(void) zio_wait(zio);
 
@@ -2056,7 +2058,7 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
 	for (vdev_t *vd =
 	    txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
 	    vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
-		zio_flush(zio, vd);
+		zio_flush(zio, vd, B_FALSE);
 
 	(void) zio_wait(zio);
 
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index 15c8b8ca6016..187d3908ff50 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -4172,7 +4172,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 		goto io_error_exit;
 	}
 	pio = zio_root(spa, NULL, NULL, 0);
-	zio_flush(pio, raidvd);
+	zio_flush(pio, raidvd, B_FALSE);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
@@ -4231,7 +4231,7 @@ raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
 		goto io_error_exit;
 	}
 	pio = zio_root(spa, NULL, NULL, 0);
-	zio_flush(pio, raidvd);
+	zio_flush(pio, raidvd, B_FALSE);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
@@ -4339,7 +4339,7 @@ vdev_raidz_reflow_copy_scratch(spa_t *spa)
 	}
 	zio_wait(pio);
 	pio = zio_root(spa, NULL, NULL, 0);
-	zio_flush(pio, raidvd);
+	zio_flush(pio, raidvd, B_FALSE);
 	zio_wait(pio);
 
 	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 34be54b337fd..fb53200378d3 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1463,12 +1463,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
 		 * includes ZIO errors from either this LWB's write or
 		 * flush, as well as any errors from other dependent LWBs
 		 * (e.g. a root LWB ZIO that might be a child of this LWB).
-		 *
-		 * With that said, it's important to note that LWB flush
-		 * errors are not propagated up to the LWB root ZIO.
-		 * This is incorrect behavior, and results in VDEV flush
-		 * errors not being handled correctly here. See the
-		 * comment above the call to "zio_flush" for details.
 		 */
 
 		zcw->zcw_zio_error = zio->io_error;
@@ -1618,17 +1612,8 @@ zil_lwb_write_done(zio_t *zio)
 
 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
-		if (vd != NULL) {
-			/*
-			 * The "ZIO_FLAG_DONT_PROPAGATE" is currently
-			 * always used within "zio_flush". This means,
-			 * any errors when flushing the vdev(s), will
-			 * (unfortunately) not be handled correctly,
-			 * since these "zio_flush" errors will not be
-			 * propagated up to "zil_lwb_flush_vdevs_done".
-			 */
-			zio_flush(lwb->lwb_root_zio, vd);
-		}
+		if (vd != NULL)
+			zio_flush(lwb->lwb_root_zio, vd, B_TRUE);
 		kmem_free(zv, sizeof (*zv));
 	}
 }
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index d68d5ababe79..278ff220e392 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1624,10 +1624,10 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
  * the flushes complete.
  */
 void
-zio_flush(zio_t *pio, vdev_t *vd)
+zio_flush(zio_t *pio, vdev_t *vd, boolean_t propagate)
 {
-	const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
-	    ZIO_FLAG_DONT_RETRY;
+	const zio_flag_t flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
+	    (propagate ? 0 : ZIO_FLAG_DONT_PROPAGATE);
 
 	if (vd->vdev_nowritecache)
 		return;
@@ -1638,7 +1638,7 @@ zio_flush(zio_t *pio, vdev_t *vd)
 		    NULL, ZIO_STAGE_OPEN, ZIO_FLUSH_PIPELINE));
 	} else {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
-			zio_flush(pio, vd->vdev_child[c]);
+			zio_flush(pio, vd->vdev_child[c], propagate);
 	}
 }