From d3e1def4d8e43db169fd6f03863f3ba5a44ebcd5 Mon Sep 17 00:00:00 2001 From: Arun KV Date: Wed, 28 Jul 2021 19:41:42 +0530 Subject: [PATCH 1/6] Fixed data integrity issue when underlying disk returns error to zfs zil_lwb_write_done error was not propagated to zil_lwb_flush_vdevs_done, due to which zil_commit_impl was returning and application gets write success even though zfs was not able to write data to the disk. Signed-off-by: Arun KV --- module/zfs/zil.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 2eeb4fa4fe42..8045a09d33aa 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1179,7 +1179,8 @@ zil_lwb_flush_vdevs_done(zio_t *zio) ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; - zcw->zcw_zio_error = zio->io_error; + if (zio->io_error != 0) + zcw->zcw_zio_error = zio->io_error; ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; @@ -1253,6 +1254,24 @@ zil_lwb_write_done(zio_t *zio) * written out. */ if (zio->io_error != 0) { + /* + * Copy the write error to zcw, becaues the zil_lwb_write_done + * error is not propagated to zil_lwb_flush_vdevs_done, which + * will cause zil_commit_impl to return without committing + * the data. + * Refer https://github.com/openzfs/zfs/issues/12391 + * for more details. + */ + zil_commit_waiter_t *zcw; + for (zcw = list_head(&lwb->lwb_waiters); zcw != NULL; + zcw = list_next(&lwb->lwb_waiters, zcw)) { + mutex_enter(&zcw->zcw_lock); + ASSERT(list_link_active(&zcw->zcw_node)); + ASSERT3P(zcw->zcw_lwb, ==, lwb); + zcw->zcw_zio_error = zio->io_error; + mutex_exit(&zcw->zcw_lock); + } + while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; From 50cf0195080d571c9ff40038974d44d88c56ddb1 Mon Sep 17 00:00:00 2001 From: Arun KV Date: Fri, 13 Aug 2021 13:07:30 +0530 Subject: [PATCH 2/6] Incorporated all review comments. --- module/zfs/zil.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 8045a09d33aa..502136308cc3 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1179,8 +1179,21 @@ zil_lwb_flush_vdevs_done(zio_t *zio) ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; - if (zio->io_error != 0) + /* + * Overwrite zcw_zio_error only if there is an error + * in flush, otherwise propagate the zcw_zio_error + * that is already set during the zil_lwb_write_done. + * Refer https://github.com/openzfs/zfs/issues/12391 + * for more details. + */ + if (zio->io_error != 0) { + /* + * If the flush has failed, then the write must have + * been successful. VERIFY the same. + */ + VERIFY(zcw->zcw_zio_error == 0); zcw->zcw_zio_error = zio->io_error; + } ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; @@ -1254,14 +1267,6 @@ zil_lwb_write_done(zio_t *zio) * written out. */ if (zio->io_error != 0) { - /* - * Copy the write error to zcw, becaues the zil_lwb_write_done - * error is not propagated to zil_lwb_flush_vdevs_done, which - * will cause zil_commit_impl to return without committing - * the data. - * Refer https://github.com/openzfs/zfs/issues/12391 - * for more details. - */ zil_commit_waiter_t *zcw; for (zcw = list_head(&lwb->lwb_waiters); zcw != NULL; zcw = list_next(&lwb->lwb_waiters, zcw)) { From 6256570c5bb3037fa6fa802e9889bbf0a65bc67d Mon Sep 17 00:00:00 2001 From: Arun KV Date: Fri, 13 Aug 2021 13:07:30 +0530 Subject: [PATCH 3/6] Incorporated all review comments. Signed-off-by: Arun KV --- module/zfs/zil.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 8045a09d33aa..502136308cc3 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1179,8 +1179,21 @@ zil_lwb_flush_vdevs_done(zio_t *zio) ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; - if (zio->io_error != 0) + /* + * Overwrite zcw_zio_error only if there is an error + * in flush, otherwise propagate the zcw_zio_error + * that is already set during the zil_lwb_write_done. + * Refer https://github.com/openzfs/zfs/issues/12391 + * for more details. + */ + if (zio->io_error != 0) { + /* + * If the flush has failed, then the write must have + * been successful. VERIFY the same. + */ + VERIFY(zcw->zcw_zio_error == 0); zcw->zcw_zio_error = zio->io_error; + } ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; @@ -1254,14 +1267,6 @@ zil_lwb_write_done(zio_t *zio) * written out. */ if (zio->io_error != 0) { - /* - * Copy the write error to zcw, becaues the zil_lwb_write_done - * error is not propagated to zil_lwb_flush_vdevs_done, which - * will cause zil_commit_impl to return without committing - * the data. - * Refer https://github.com/openzfs/zfs/issues/12391 - * for more details. - */ zil_commit_waiter_t *zcw; for (zcw = list_head(&lwb->lwb_waiters); zcw != NULL; zcw = list_next(&lwb->lwb_waiters, zcw)) { From 5bbbcd0043dc6106dd823fb9340d3384f8cd80e9 Mon Sep 17 00:00:00 2001 From: Arun KV Date: Mon, 16 Aug 2021 09:21:11 +0530 Subject: [PATCH 4/6] Added assert zcw_zio_error is zero-initialized Signed-off-by: Arun KV --- module/zfs/zil.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 502136308cc3..553f2476d61a 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1273,6 +1273,7 @@ zil_lwb_write_done(zio_t *zio) mutex_enter(&zcw->zcw_lock); ASSERT(list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, lwb); + ASSERT3S(zcw->zcw_zio_error, ==, 0); zcw->zcw_zio_error = zio->io_error; mutex_exit(&zcw->zcw_lock); } From bad4ca31ef8896cb27c2b2a5820c9eafd38ed2c3 Mon Sep 17 00:00:00 2001 From: Arun KV Date: Wed, 18 Aug 2021 18:19:59 +0530 Subject: [PATCH 5/6] Removing the ZIO_FLAG_DONT_PROPAGATE from zio_rewrite since it consolidates the error handling for flush and write errors to a single location (rather than having error handling split between the "write done" and "flush done" handlers). Signed-off-by: Arun KV --- module/zfs/zil.c | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 553f2476d61a..252c1a8abb20 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1179,21 +1179,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; - /* - * Overwrite zcw_zio_error only if there is an error - * in flush, otherwise propagate the zcw_zio_error - * that is already set during the zil_lwb_write_done. - * Refer https://github.com/openzfs/zfs/issues/12391 - * for more details. - */ - if (zio->io_error != 0) { - /* - * If the flush has failed, then the write must have - * been successful. VERIFY the same. - */ - VERIFY(zcw->zcw_zio_error == 0); - zcw->zcw_zio_error = zio->io_error; - } + zcw->zcw_zio_error = zio->io_error; ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; @@ -1267,17 +1253,6 @@ zil_lwb_write_done(zio_t *zio) * written out. */ if (zio->io_error != 0) { - zil_commit_waiter_t *zcw; - for (zcw = list_head(&lwb->lwb_waiters); zcw != NULL; - zcw = list_next(&lwb->lwb_waiters, zcw)) { - mutex_enter(&zcw->zcw_lock); - ASSERT(list_link_active(&zcw->zcw_node)); - ASSERT3P(zcw->zcw_lwb, ==, lwb); - ASSERT3S(zcw->zcw_zio_error, ==, 0); - zcw->zcw_zio_error = zio->io_error; - mutex_exit(&zcw->zcw_lock); - } - while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; @@ -1424,8 +1399,7 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, - prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_FASTWRITE, &zb); + prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); ASSERT3P(lwb->lwb_write_zio, !=, NULL); lwb->lwb_state = LWB_STATE_OPENED; From 5183aab40b0cf2896c5904c51e3d0003f0545a0d Mon Sep 17 00:00:00 2001 From: Arun KV Date: Tue, 31 Aug 2021 17:40:53 +0530 Subject: [PATCH 6/6] Incorporate all the suggested comments Signed-off-by: Arun KV --- module/zfs/zil.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 252c1a8abb20..640e805d093a 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1178,6 +1178,20 @@ zil_lwb_flush_vdevs_done(zio_t *zio) ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; + /* + * We expect any ZIO errors from child ZIOs to have been + * propagated "up" to this specific LWB's root ZIO, in + * order for this error handling to work correctly. This + * includes ZIO errors from either this LWB's write or + * flush, as well as any errors from other dependent LWBs + * (e.g. a root LWB ZIO that might be a child of this LWB). + * + * With that said, it's important to note that LWB flush + * errors are not propagated up to the LWB root ZIO. + * This is incorrect behavior, and results in VDEV flush + * errors not being handled correctly here. See the + * comment above the call to "zio_flush" for details. + */ zcw->zcw_zio_error = zio->io_error; @@ -1251,6 +1265,12 @@ zil_lwb_write_done(zio_t *zio) * nodes. We avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. + * + * Additionally, we don't perform any further error handling at + * this point (e.g. setting "zcw_zio_error" appropriately), as + * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, + * we expect any error seen here, to have been propagated to + * that function). */ if (zio->io_error != 0) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) @@ -1281,8 +1301,17 @@ zil_lwb_write_done(zio_t *zio) while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL) + if (vd != NULL) { + /* + * The "ZIO_FLAG_DONT_PROPAGATE" is currently + * always used within "zio_flush". This means, + * any errors when flushing the vdev(s), will + * (unfortunately) not be handled correctly, + * since these "zio_flush" errors will not be + * propagated up to "zil_lwb_flush_vdevs_done". + */ zio_flush(lwb->lwb_root_zio, vd); + } kmem_free(zv, sizeof (*zv)); } }