Skip to content

Commit

Permalink
Fix data race between zil_commit() and zil_suspend()
Browse files Browse the repository at this point in the history
openzfsonwindows#206 found that it is possible to trip
`VERIFY(list_is_empty(&lwb->lwb_itxs))` when a `zil_commit()` is delayed
by the scheduler long enough for a parallel `zil_suspend()` operation to
exit `zil_commit_impl()`. This is a data race. To prevent this, we
introduce a `zilog->zl_suspend_lock` rwlock to ensure that all
outstanding `zil_commit()` operations finish before `zil_suspend()`
begins and that subsequent operations fallback to `txg_wait_synced()`
after `zil_suspend()` has begun.

On `PREEMPT_RT` Linux kernels, the `rw_enter()` implementation suffers
from writer starvation. This means that a ZIL intensive system can delay
`zil_suspend()` indefinitely. This is a pre-existing problem that
affects everything that uses rw locks, so it needs to be addressed in
the SPL.  However, builds against `PREEMPT_RT` Linux kernels are
currently broken due to a GPL symbol issue (openzfs#11097), so we can safely
disregard that issue for now.

Reported-by: Arun KV <[email protected]>
Signed-off-by: Richard Yao <[email protected]>
  • Loading branch information
ryao committed Mar 1, 2023
1 parent 4fe9cc5 commit 81b8c86
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 0 deletions.
1 change: 1 addition & 0 deletions include/sys/zil_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ struct zilog {
uint64_t zl_destroy_txg; /* txg of last zil_destroy() */
uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */
uint64_t zl_replaying_seq; /* current replay seq number */
krwlock_t zl_suspend_lock; /* protects suspend count */
uint32_t zl_suspend; /* log suspend count */
kcondvar_t zl_cv_suspend; /* log suspend completion */
uint8_t zl_suspending; /* log is currently suspending */
Expand Down
27 changes: 27 additions & 0 deletions module/zfs/zil.c
Original file line number Diff line number Diff line change
Expand Up @@ -3195,6 +3195,21 @@ zil_commit(zilog_t *zilog, uint64_t foid)
return;
}

/*
* The ->zl_suspend_lock rwlock ensures that all in-flight
* zil_commit() operations finish before suspension begins and that
* no more begin. Without it, it is possible for the scheduler to
* preempt us right after the zilog->zl_suspend suspend check, run
* another thread that runs zil_suspend() and after the other thread
* has finished its call to zil_commit_impl(), resume this thread while
* zil is suspended. This can trigger an assertion failure in
* VERIFY(list_is_empty(&lwb->lwb_itxs)). If it is held, it means that
* `zil_suspend()` is executing in another thread, so we go to
* txg_wait_synced().
*/
if (!rw_tryenter(&zilog->zl_suspend_lock, RW_READER))
goto wait;

/*
* If the ZIL is suspended, we don't want to dirty it by calling
* zil_commit_itx_assign() below, nor can we write out
Expand All @@ -3203,11 +3218,14 @@ zil_commit(zilog_t *zilog, uint64_t foid)
* semantics, and avoid calling those functions altogether.
*/
if (zilog->zl_suspend > 0) {
rw_exit(&zilog->zl_suspend_lock);
wait:
txg_wait_synced(zilog->zl_dmu_pool, 0);
return;
}

zil_commit_impl(zilog, foid);
rw_exit(&zilog->zl_suspend_lock);
}

void
Expand Down Expand Up @@ -3472,6 +3490,8 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys)
cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);

rw_init(&zilog->zl_suspend_lock, NULL, RW_DEFAULT, NULL);

return (zilog);
}

Expand Down Expand Up @@ -3511,6 +3531,8 @@ zil_free(zilog_t *zilog)
cv_destroy(&zilog->zl_cv_suspend);
cv_destroy(&zilog->zl_lwb_io_cv);

rw_destroy(&zilog->zl_suspend_lock);

kmem_free(zilog, sizeof (zilog_t));
}

Expand Down Expand Up @@ -3638,11 +3660,14 @@ zil_suspend(const char *osname, void **cookiep)
return (error);
zilog = dmu_objset_zil(os);

rw_enter(&zilog->zl_suspend_lock, RW_WRITER);

mutex_enter(&zilog->zl_lock);
zh = zilog->zl_header;

if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
mutex_exit(&zilog->zl_lock);
rw_exit(&zilog->zl_suspend_lock);
dmu_objset_rele(os, suspend_tag);
return (SET_ERROR(EBUSY));
}
Expand All @@ -3656,6 +3681,7 @@ zil_suspend(const char *osname, void **cookiep)
if (cookiep == NULL && !zilog->zl_suspending &&
(zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
mutex_exit(&zilog->zl_lock);
rw_exit(&zilog->zl_suspend_lock);
dmu_objset_rele(os, suspend_tag);
return (0);
}
Expand All @@ -3664,6 +3690,7 @@ zil_suspend(const char *osname, void **cookiep)
dsl_pool_rele(dmu_objset_pool(os), suspend_tag);

zilog->zl_suspend++;
rw_exit(&zilog->zl_suspend_lock);

if (zilog->zl_suspend > 1) {
/*
Expand Down

0 comments on commit 81b8c86

Please sign in to comment.