From df1be95aa854ea733549ec4c5dcdcc2f56e17e3a Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Mon, 12 Sep 2022 12:55:37 -0400 Subject: [PATCH] Cleanup: Use OpenSolaris functions to call scheduler In our codebase, `cond_resched() and `schedule()` are Linux kernel functions that have replaced the OpenSolaris `kpreempt()` functions in the codebase to such an extent that `kpreempt()` in zfs_context.h was broken. Nobody noticed because we did not actually use it. The header had defined `kpreempt()` as `yield()`, which works on OpenSolaris and Illumos where `sched_yield()` is a wrapper for `yield()`, but that does not work on any other platform. The FreeBSD platform specific code implemented shims for these, but the shim for `schedule()` forced us to wait, which is different than merely rescheduling to another thread as the original Linux code does, while the shim for `cond_resched()` had the same definition as its kernel kpreempt() shim. After studying this, I have concluded that we should reintroduce the kpreempt() function in platform independent code with the following definitions: - In the Linux kernel: kpreempt(unused) -> cond_resched() - In the FreeBSD kernel: kpreempt(unused) -> kern_yield(PRI_USER) - In userspace: kpreempt(unused) -> sched_yield() In userspace, nothing changes from this cleanup. In the kernels, the function `fm_fini()` will now call `kern_yield(PRI_USER)` on FreeBSD and `cond_resched()` on Linux. This is instead of `pause("schedule", 1)` on FreeBSD and `schedule()` on Linux. This makes our behavior consistent across platforms. Note that Linux's SPL continues to use `cond_resched()` and `schedule()`. However, those functions have been removed from both the FreeBSD code and userspace code. This should have the benefit of making it slightly easier to port the code to new platforms by making how things should be mapped less confusing. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Reviewed-by: Neal Gompa Signed-off-by: Richard Yao Closes #13845 --- include/os/freebsd/spl/sys/disp.h | 2 ++ include/os/freebsd/spl/sys/timer.h | 2 -- include/os/freebsd/zfs/sys/zfs_context_os.h | 2 -- include/os/linux/spl/sys/disp.h | 4 +++- include/sys/zfs_context.h | 5 +++-- module/zfs/arc.c | 4 ++-- module/zfs/dnode.c | 6 +++--- module/zfs/fm.c | 2 +- module/zfs/spa_log_spacemap.c | 2 +- 9 files changed, 15 insertions(+), 14 deletions(-) diff --git a/include/os/freebsd/spl/sys/disp.h b/include/os/freebsd/spl/sys/disp.h index 2be1b76e4334..d46a7d2c0143 100644 --- a/include/os/freebsd/spl/sys/disp.h +++ b/include/os/freebsd/spl/sys/disp.h @@ -31,6 +31,8 @@ #include +#define KPREEMPT_SYNC (-1) + #define kpreempt(x) kern_yield(PRI_USER) #endif /* _OPENSOLARIS_SYS_DISP_H_ */ diff --git a/include/os/freebsd/spl/sys/timer.h b/include/os/freebsd/spl/sys/timer.h index d4694bb7c09c..7ff77e9b1b74 100644 --- a/include/os/freebsd/spl/sys/timer.h +++ b/include/os/freebsd/spl/sys/timer.h @@ -33,6 +33,4 @@ #define usleep_range(wakeup, wakeupepsilon) \ pause_sbt("usleep_range", ustosbt(wakeup), \ ustosbt(wakeupepsilon - wakeup), 0) - -#define schedule() pause("schedule", 1) #endif diff --git a/include/os/freebsd/zfs/sys/zfs_context_os.h b/include/os/freebsd/zfs/sys/zfs_context_os.h index 867199501396..1ce72330412c 100644 --- a/include/os/freebsd/zfs/sys/zfs_context_os.h +++ b/include/os/freebsd/zfs/sys/zfs_context_os.h @@ -45,8 +45,6 @@ #define HAVE_LARGE_STACKS 1 #endif -#define cond_resched() kern_yield(PRI_USER) - #define taskq_create_sysdc(a, b, d, e, p, dc, f) \ ((void) sizeof (dc), taskq_create(a, b, maxclsyspri, d, e, f)) diff --git a/include/os/linux/spl/sys/disp.h b/include/os/linux/spl/sys/disp.h index e106d3c5438e..c8be6ffbf10f 100644 --- a/include/os/linux/spl/sys/disp.h +++ b/include/os/linux/spl/sys/disp.h @@ -26,7 +26,9 @@ #include -#define kpreempt(unused) schedule() +#define KPREEMPT_SYNC (-1) + +#define kpreempt(unused) cond_resched() #define kpreempt_disable() preempt_disable() #define kpreempt_enable() preempt_enable() diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index aa4f78789631..83ed97fbec7f 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -219,7 +219,6 @@ typedef pthread_t kthread_t; #define TS_JOINABLE 0x00000004 #define curthread ((void *)(uintptr_t)pthread_self()) -#define kpreempt(x) yield() #define getcomm() "unknown" #define thread_create_named(name, stk, stksize, func, arg, len, \ @@ -248,9 +247,11 @@ extern kthread_t *zk_thread_create(void (*func)(void *), void *arg, #define issig(why) (FALSE) #define ISSIG(thr, why) (FALSE) +#define KPREEMPT_SYNC (-1) + +#define kpreempt(x) sched_yield() #define kpreempt_disable() ((void)0) #define kpreempt_enable() ((void)0) -#define cond_resched() sched_yield() /* * Mutexes diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 980dc60d0cc0..b9969bff534e 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4165,7 +4165,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, * this CPU are able to make progress, make a voluntary preemption * call here. */ - cond_resched(); + kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } @@ -10335,7 +10335,7 @@ l2arc_rebuild(l2arc_dev_t *dev) !dev->l2ad_first) goto out; - cond_resched(); + kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 67fe1e2c9a0f..ef27dfd40af1 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1142,7 +1142,7 @@ dnode_free_interior_slots(dnode_t *dn) while (!dnode_slots_tryenter(children, idx, slots)) { DNODE_STAT_BUMP(dnode_free_interior_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } dnode_set_slots(children, idx, slots, DN_SLOT_FREE); @@ -1423,7 +1423,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } /* @@ -1478,7 +1478,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, dnode_slots_rele(dnc, idx, slots); while (!dnode_slots_tryenter(dnc, idx, slots)) { DNODE_STAT_BUMP(dnode_hold_free_lock_retry); - cond_resched(); + kpreempt(KPREEMPT_SYNC); } if (!dnode_check_slots_free(dnc, idx, slots)) { diff --git a/module/zfs/fm.c b/module/zfs/fm.c index e7a7ad583242..bc13b5517c4e 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -1354,7 +1354,7 @@ fm_fini(void) zevent_flags |= ZEVENT_SHUTDOWN; while (zevent_waiters > 0) { mutex_exit(&zevent_lock); - schedule(); + kpreempt(KPREEMPT_SYNC); mutex_enter(&zevent_lock); } mutex_exit(&zevent_lock); diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c index 19e334916bd0..4ecce8214f6a 100644 --- a/module/zfs/spa_log_spacemap.c +++ b/module/zfs/spa_log_spacemap.c @@ -1176,7 +1176,7 @@ spa_ld_log_sm_data(spa_t *spa) } /* Load TXG log spacemap into ms_unflushed_allocs/frees. */ - cond_resched(); + kpreempt(KPREEMPT_SYNC); ASSERT0(sls->sls_nblocks); sls->sls_nblocks = space_map_nblocks(sls->sls_sm); spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;