From 2c8810b1aa63d331274ecbdf8dab882f80dc798b Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Fri, 12 Feb 2016 16:27:56 -0500
Subject: [PATCH 1/4] Update tc_count via atomic instruction rather than mutex

Using a mutex to protect increment and decrement operations is
inefficient. Each lock/unlock pair costs two atomic instructions, the
lock will spin when a single waiter is present (under Linux with typical
configuration options), and if two waiters occur, it will actually
deschedule them, which is far slower than spinning. Using atomic
instructions allows the hardware to handle the arbitration, which is
much faster than doing it in software.

Signed-off-by: Richard Yao
---
 module/zfs/txg.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index ed8007e05238..9e320e5a864a 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -310,9 +310,7 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
 	mutex_enter(&tc->tc_open_lock);
 	txg = tx->tx_open_txg;
 
-	mutex_enter(&tc->tc_lock);
-	tc->tc_count[txg & TXG_MASK]++;
-	mutex_exit(&tc->tc_lock);
+	atomic_inc_64(&tc->tc_count[txg & TXG_MASK]);
 
 	th->th_cpu = tc;
 	th->th_txg = txg;
@@ -346,11 +344,9 @@ txg_rele_to_sync(txg_handle_t *th)
 	tx_cpu_t *tc = th->th_cpu;
 	int g = th->th_txg & TXG_MASK;
 
-	mutex_enter(&tc->tc_lock);
 	ASSERT(tc->tc_count[g] != 0);
-	if (--tc->tc_count[g] == 0)
+	if (atomic_dec_64_nv(&tc->tc_count[g]) == 0)
 		cv_broadcast(&tc->tc_cv[g]);
-	mutex_exit(&tc->tc_lock);
 
 	th->th_cpu = NULL;	/* defensive */
 }
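To make the locking change concrete, here is a minimal userspace sketch of
the same pattern, using C11 atomics and pthreads. It is an analogue, not the
ZFS code: refwait_t, hold(), rele(), and wait_for_zero() are illustrative
stand-ins for tx_cpu_t, txg_hold_open(), txg_rele_to_sync(), and the wait
loop in txg_quiesce().

	/* Build with: cc -std=c11 -pthread refwait.c */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdint.h>

	typedef struct refwait {
		atomic_uint_fast64_t	rw_count;	/* like tc_count[g] */
		pthread_mutex_t		rw_lock;	/* guards the wait, not the math */
		pthread_cond_t		rw_cv;		/* like tc_cv[g] */
	} refwait_t;

	static refwait_t rw = {
		.rw_lock = PTHREAD_MUTEX_INITIALIZER,
		.rw_cv = PTHREAD_COND_INITIALIZER,
	};

	static void
	hold(refwait_t *r)
	{
		/* A single atomic instruction; no lock, no chance of descheduling. */
		atomic_fetch_add(&r->rw_count, 1);
	}

	static void
	rele(refwait_t *r)
	{
		/* atomic_fetch_sub() returns the old value, so 1 means we hit zero. */
		if (atomic_fetch_sub(&r->rw_count, 1) == 1) {
			pthread_mutex_lock(&r->rw_lock);
			pthread_cond_broadcast(&r->rw_cv);
			pthread_mutex_unlock(&r->rw_lock);
		}
	}

	static void
	wait_for_zero(refwait_t *r)
	{
		pthread_mutex_lock(&r->rw_lock);
		while (atomic_load(&r->rw_count) != 0)
			pthread_cond_wait(&r->rw_cv, &r->rw_lock);
		pthread_mutex_unlock(&r->rw_lock);
	}

In this sketch the broadcast is issued under the lock so a waiter cannot
check the count and then sleep after the final release has already signaled;
the fast paths, hold() and rele() on a nonzero count, never touch the mutex
at all.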
From df27be61b474e98d0cd1a8ff6161842b42614965 Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Fri, 12 Feb 2016 17:54:12 -0500
Subject: [PATCH 2/4] Do not hold ->tc_open_lock in DMU transactions.

There does not appear to be anything protected by ->tc_open_lock that
callers use while holding it, yet holding ->tc_open_lock for the life of
the open transaction means that anything that preempts us will block on
us, when the only things that really need protection are ->tx_open_txg
and ->tc_count.

Signed-off-by: Richard Yao
---
 module/zfs/txg.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 9e320e5a864a..d7c03b873fc2 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -310,7 +310,17 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
 	mutex_enter(&tc->tc_open_lock);
 	txg = tx->tx_open_txg;
 
+	/*
+	 * We increment ->tc_count inside of the critical section so that the
+	 * zero check in txg_quiesce() does not race with the increment. We
+	 * use an atomic so that txg_rele_to_sync() can decrement the count
+	 * without taking &tc->tc_lock. This uses one fewer atomic instruction
+	 * on both increment and decrement than using &tc->tc_lock would.
+	 * Additionally, avoiding a second lock reduces the potential for
+	 * contention to cause us to block while holding the open lock.
+	 */
 	atomic_inc_64(&tc->tc_count[txg & TXG_MASK]);
+	mutex_exit(&tc->tc_open_lock);
 
 	th->th_cpu = tc;
 	th->th_txg = txg;
@@ -318,13 +328,13 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
 	return (txg);
 }
 
+/*
+ * This is obsolete, but we keep it around to make porting patches easier and
+ * also so that it can be used as a tracepoint.
+ */
 void
 txg_rele_to_quiesce(txg_handle_t *th)
 {
-	tx_cpu_t *tc = th->th_cpu;
-
-	ASSERT(!MUTEX_HELD(&tc->tc_lock));
-	mutex_exit(&tc->tc_open_lock);
 }
 
 void

From 3ac9d91b61a566307de42dcf998a63899f19791d Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Wed, 17 Feb 2016 16:16:25 -0500
Subject: [PATCH 3/4] txg visibility code should not execute under tc_open_lock

The memory allocation and locking in `spa_txg_history_*()` can block
txg_hold_open() for arbitrarily long periods of time.

Signed-off-by: Richard Yao
---
 module/zfs/txg.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index d7c03b873fc2..ebedccde43c9 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -373,6 +373,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 	tx_state_t *tx = &dp->dp_tx;
 	int g = txg & TXG_MASK;
 	int c;
+	uint64_t tx_open_time;
 
 	/*
 	 * Grab all tc_open_locks so nobody else can get into this txg.
@@ -382,10 +383,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 
 	ASSERT(txg == tx->tx_open_txg);
 	tx->tx_open_txg++;
-	tx->tx_open_time = gethrtime();
-
-	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx->tx_open_time);
-	spa_txg_history_add(dp->dp_spa, tx->tx_open_txg, tx->tx_open_time);
+	tx->tx_open_time = tx_open_time = gethrtime();
 
 	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
 	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
@@ -397,6 +395,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 	for (c = 0; c < max_ncpus; c++)
 		mutex_exit(&tx->tx_cpu[c].tc_open_lock);
 
+	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx_open_time);
+	spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time);
+
 	/*
 	 * Quiesce the transaction group by waiting for everyone to txg_exit().
 	 */
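The fix in patch 3 is a general pattern: snapshot whatever the critical
section produces into locals, drop the lock, and only then run work that may
allocate or block. A small userspace sketch of that pattern, assuming a
hypothetical record_history() that stands in for the spa_txg_history_*()
calls:

	#include <pthread.h>
	#include <stdint.h>
	#include <time.h>

	static pthread_mutex_t open_lock = PTHREAD_MUTEX_INITIALIZER;
	static uint64_t open_txg;
	static uint64_t open_time;

	/* Hypothetical stand-in for spa_txg_history_*(); may allocate and block. */
	extern void record_history(uint64_t txg, uint64_t when);

	static uint64_t
	now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
	}

	static void
	advance_txg(void)
	{
		uint64_t txg, when;

		pthread_mutex_lock(&open_lock);
		txg = open_txg++;
		open_time = when = now_ns();	/* snapshot into locals under the lock */
		pthread_mutex_unlock(&open_lock);

		/* Slow, possibly blocking bookkeeping runs with the lock dropped. */
		record_history(txg, when);
		record_history(txg + 1, when);
	}

Copying the timestamp into the local tx_open_time is what lets the
spa_txg_history_*() calls move below the mutex_exit() loop without reading
state that the next open txg may already be overwriting.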
From 41595690f47e51a20f612bee442ecee97f94dc9e Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Fri, 12 Feb 2016 18:53:09 -0500
Subject: [PATCH 4/4] Revert "Wrap smp_processor_id in kpreempt_[dis|en]able"
 and "Fix CPU_SEQID use in preemptible context"

This reverts commit 15a9e03368d8f186751a432740a5a281f45d712d and commit
8878261fc9447592844db5f7eb3df9ed3b088871.

Wrapping CPU_SEQID with kpreempt_disable() and kpreempt_enable() causes
us to call preempt_schedule(). This allows the thread to be rescheduled
onto another CPU while it still references the per-CPU structure of the
CPU it just vacated, so threads that subsequently run on the vacated CPU
can begin using that same structure and contend with us. This is an
unnecessary source of contention and a regression from the illumos
behavior. We can turn off the Linux warnings by calling
`raw_smp_processor_id()` instead of wrapping things with
kpreempt_disable() and kpreempt_enable(), so let's do that.

Requires-spl: refs/pull/532/head
Signed-off-by: Richard Yao
---
 include/sys/zfs_context.h |  4 ----
 module/zfs/fm.c           |  8 +-------
 module/zfs/txg.c          | 12 +-----------
 module/zfs/zio.c          |  7 +------
 4 files changed, 3 insertions(+), 28 deletions(-)

diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index cc626fdaaae1..9ccf1eee3448 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -67,7 +67,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -262,9 +261,6 @@ extern kthread_t *zk_thread_create(caddr_t stk, size_t stksize,
 	proc_t *pp, int state, pri_t pri, int detachstate);
 extern void zk_thread_join(kt_did_t tid);
 
-#define	kpreempt_disable()	((void)0)
-#define	kpreempt_enable()	((void)0)
-
 #define	PS_NONE		-1
 
 #define	issig(why)	(FALSE)
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index 4de901ed09c7..56d3981078f0 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -1515,13 +1515,7 @@ fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
 uint64_t
 fm_ena_generate(uint64_t timestamp, uchar_t format)
 {
-	uint64_t ena;
-
-	kpreempt_disable();
-	ena = fm_ena_generate_cpu(timestamp, getcpuid(), format);
-	kpreempt_enable();
-
-	return (ena);
+	return (fm_ena_generate_cpu(timestamp, getcpuid(), format));
 }
 
 uint64_t
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index ebedccde43c9..dd886e3d7270 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -294,19 +294,9 @@ uint64_t
 txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
 {
 	tx_state_t *tx = &dp->dp_tx;
-	tx_cpu_t *tc;
+	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
 	uint64_t txg;
 
-	/*
-	 * It appears the processor id is simply used as a "random"
-	 * number to index into the array, and there isn't any other
-	 * significance to the chosen tx_cpu. Because.. Why not use
-	 * the current cpu to index into the array?
-	 */
-	kpreempt_disable();
-	tc = &tx->tx_cpu[CPU_SEQID];
-	kpreempt_enable();
-
 	mutex_enter(&tc->tc_open_lock);
 	txg = tx->tx_open_txg;
 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 2d16e632de06..4e01f528b0a2 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1542,19 +1542,14 @@ zio_nowait(zio_t *zio)
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    zio_unique_parent(zio) == NULL) {
-		zio_t *pio;
-
 		/*
 		 * This is a logical async I/O with no parent to wait for it.
 		 * We add it to the spa_async_root_zio "Godfather" I/O which
 		 * will ensure they complete prior to unloading the pool.
 		 */
 		spa_t *spa = zio->io_spa;
 
-		kpreempt_disable();
-		pio = spa->spa_async_zio_root[CPU_SEQID];
-		kpreempt_enable();
-
-		zio_add_child(pio, zio);
+		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
 	}
 
 	__zio_execute(zio);
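In userspace terms, the point of the revert is that the CPU id here is only a
load-spreading hint: each per-CPU slot carries its own lock, so acting on a
stale id after a migration is still correct, just marginally less
cache-friendly. A sketch under those assumptions, with sched_getcpu() playing
the role of raw_smp_processor_id() and NSLOTS standing in for max_ncpus:

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <sched.h>
	#include <stdint.h>

	#define	NSLOTS	64		/* stands in for max_ncpus */

	typedef struct slot {
		pthread_mutex_t	s_lock;
		uint64_t	s_count;
	} slot_t;

	static slot_t slots[NSLOTS];

	static void
	slots_init(void)
	{
		for (int i = 0; i < NSLOTS; i++)
			pthread_mutex_init(&slots[i].s_lock, NULL);
	}

	static void
	slot_hold(void)
	{
		/*
		 * No preemption guard: if we migrate between reading the CPU
		 * id and taking the lock, we simply use another CPU's slot.
		 * The result is still correct because every slot has its own
		 * lock.
		 */
		int c = sched_getcpu();
		slot_t *s = &slots[(c < 0 ? 0 : c) % NSLOTS];

		pthread_mutex_lock(&s->s_lock);
		s->s_count++;
		pthread_mutex_unlock(&s->s_lock);
	}

slots_init() must run once before the first slot_hold(). Disabling preemption
around the lookup would buy little here: the moment it is re-enabled the
thread can migrate anyway, and the preempt_schedule() on the way out only
adds the scheduling point that the revert removes.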