From 724c8eee32abd5bd638ad265498633e6c8f66ddb Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Wed, 1 Mar 2023 16:23:09 -0500 Subject: [PATCH] Fix data race between zil_commit() and zil_suspend() openzfsonwindows/openzfs#206 found that it is possible to trip `VERIFY(list_is_empty(&lwb->lwb_itxs))` when a `zil_commit()` is delayed by the scheduler long enough for a parallel `zil_suspend()` operation to exit `zil_commit_impl()`. This is a data race. To prevent this, we introduce a `zilog->zl_suspend_lock` rwlock to ensure that all outstanding `zil_commit()` operations finish before `zil_suspend()` begins and that subsequent operations fallback to `txg_wait_synced()` after `zil_suspend()` has begun. On `PREEMPT_RT` Linux kernels, the `rw_enter()` implementation suffers from writer starvation. This means that a ZIL intensive system can delay `zil_suspend()` indefinitely. This is a pre-existing problem that affects everything that uses rw locks, so it needs to be addressed in the SPL. However, builds against `PREEMPT_RT` Linux kernels are currently broken due to a GPL symbol issue (#11097), so we can safely disregard that issue for now. Reported-by: Arun KV Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #14514 --- include/sys/zil_impl.h | 1 + module/zfs/zil.c | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index d2f4018653a6..cb01be69d7e6 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -183,6 +183,7 @@ struct zilog { uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ uint64_t zl_replaying_seq; /* current replay seq number */ + krwlock_t zl_suspend_lock; /* protects suspend count */ uint32_t zl_suspend; /* log suspend count */ kcondvar_t zl_cv_suspend; /* log suspend completion */ uint8_t zl_suspending; /* log is currently suspending */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 0f9d196a3df1..658513786b34 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -2941,6 +2941,21 @@ zil_commit(zilog_t *zilog, uint64_t foid) return; } + /* + * The ->zl_suspend_lock rwlock ensures that all in-flight + * zil_commit() operations finish before suspension begins and that + * no more begin. Without it, it is possible for the scheduler to + * preempt us right after the zilog->zl_suspend suspend check, run + * another thread that runs zil_suspend() and after the other thread + * has finished its call to zil_commit_impl(), resume this thread while + * zil is suspended. This can trigger an assertion failure in + * VERIFY(list_is_empty(&lwb->lwb_itxs)). If it is held, it means that + * `zil_suspend()` is executing in another thread, so we go to + * txg_wait_synced(). + */ + if (!rw_tryenter(&zilog->zl_suspend_lock, RW_READER)) + goto wait; + /* * If the ZIL is suspended, we don't want to dirty it by calling * zil_commit_itx_assign() below, nor can we write out @@ -2949,11 +2964,14 @@ zil_commit(zilog_t *zilog, uint64_t foid) * semantics, and avoid calling those functions altogether. */ if (zilog->zl_suspend > 0) { + rw_exit(&zilog->zl_suspend_lock); +wait: txg_wait_synced(zilog->zl_dmu_pool, 0); return; } zil_commit_impl(zilog, foid); + rw_exit(&zilog->zl_suspend_lock); } void @@ -3197,6 +3215,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); + rw_init(&zilog->zl_suspend_lock, NULL, RW_DEFAULT, NULL); + return (zilog); } @@ -3234,6 +3254,8 @@ zil_free(zilog_t *zilog) cv_destroy(&zilog->zl_cv_suspend); + rw_destroy(&zilog->zl_suspend_lock); + kmem_free(zilog, sizeof (zilog_t)); } @@ -3354,11 +3376,14 @@ zil_suspend(const char *osname, void **cookiep) } zilog = dmu_objset_zil(os); + rw_enter(&zilog->zl_suspend_lock, RW_WRITER); + mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); + rw_exit(&zilog->zl_suspend_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } @@ -3372,6 +3397,7 @@ zil_suspend(const char *osname, void **cookiep) if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); + rw_exit(&zilog->zl_suspend_lock); dmu_objset_rele(os, suspend_tag); TraceEvent(8, "%s:%d: Returning 0\n", __func__, __LINE__); return (0); @@ -3381,6 +3407,7 @@ zil_suspend(const char *osname, void **cookiep) dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; + rw_exit(&zilog->zl_suspend_lock); if (zilog->zl_suspend > 1) { /*