From 2c8810b1aa63d331274ecbdf8dab882f80dc798b Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Fri, 12 Feb 2016 16:27:56 -0500
Subject: [PATCH 1/4] Update tc_count via atomic instruction rather than mutex

Using a mutex to protect increment and decrement operations is
inefficient. Each lock/unlock pair costs two atomic instructions, the
lock will spin when a single waiter is present (under Linux with typical
configuration options), and if two waiters occur, it will actually
deschedule them, which is far slower than spinning. Using atomic
instructions allows the hardware to handle the arbitration, which is
much faster than doing it in software.

Signed-off-by: Richard Yao
---
 module/zfs/txg.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index ed8007e05238..9e320e5a864a 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -310,9 +310,7 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
 	mutex_enter(&tc->tc_open_lock);
 	txg = tx->tx_open_txg;
 
-	mutex_enter(&tc->tc_lock);
-	tc->tc_count[txg & TXG_MASK]++;
-	mutex_exit(&tc->tc_lock);
+	atomic_inc_64(&tc->tc_count[txg & TXG_MASK]);
 
 	th->th_cpu = tc;
 	th->th_txg = txg;
@@ -346,11 +344,9 @@ txg_rele_to_sync(txg_handle_t *th)
 	tx_cpu_t *tc = th->th_cpu;
 	int g = th->th_txg & TXG_MASK;
 
-	mutex_enter(&tc->tc_lock);
 	ASSERT(tc->tc_count[g] != 0);
-	if (--tc->tc_count[g] == 0)
+	if (atomic_dec_64_nv(&tc->tc_count[g]) == 0)
 		cv_broadcast(&tc->tc_cv[g]);
-	mutex_exit(&tc->tc_lock);
 
 	th->th_cpu = NULL;	/* defensive */
 }
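To make the locking change concrete, here is a minimal userspace sketch of
the same pattern, using C11 atomics and pthreads. It is an analogue, not the
ZFS code: refwait_t, hold(), rele(), and wait_for_zero() are illustrative
stand-ins for tx_cpu_t, txg_hold_open(), txg_rele_to_sync(), and the wait
loop in txg_quiesce().

	/* Build with: cc -std=c11 -pthread refwait.c */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdint.h>

	typedef struct refwait {
		atomic_uint_fast64_t	rw_count;	/* like tc_count[g] */
		pthread_mutex_t		rw_lock;	/* guards the wait, not the math */
		pthread_cond_t		rw_cv;		/* like tc_cv[g] */
	} refwait_t;

	static refwait_t rw = {
		.rw_lock = PTHREAD_MUTEX_INITIALIZER,
		.rw_cv = PTHREAD_COND_INITIALIZER,
	};

	static void
	hold(refwait_t *r)
	{
		/* A single atomic instruction; no lock, no chance of descheduling. */
		atomic_fetch_add(&r->rw_count, 1);
	}

	static void
	rele(refwait_t *r)
	{
		/* atomic_fetch_sub() returns the old value, so 1 means we hit zero. */
		if (atomic_fetch_sub(&r->rw_count, 1) == 1) {
			pthread_mutex_lock(&r->rw_lock);
			pthread_cond_broadcast(&r->rw_cv);
			pthread_mutex_unlock(&r->rw_lock);
		}
	}

	static void
	wait_for_zero(refwait_t *r)
	{
		pthread_mutex_lock(&r->rw_lock);
		while (atomic_load(&r->rw_count) != 0)
			pthread_cond_wait(&r->rw_cv, &r->rw_lock);
		pthread_mutex_unlock(&r->rw_lock);
	}

In this sketch the broadcast is issued under the lock so a waiter cannot
check the count and then sleep after the final release has already signaled;
the fast paths, hold() and rele() on a nonzero count, never touch the mutex
at all.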
From df27be61b474e98d0cd1a8ff6161842b42614965 Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Fri, 12 Feb 2016 17:54:12 -0500
Subject: [PATCH 2/4] Do not hold ->tc_open_lock in DMU transactions.

There does not appear to be anything protected by ->tc_open_lock that
callers use while holding it, yet holding ->tc_open_lock for the life of
the open transaction means that anything that preempts us will block on
us, when the only things that really need protection are ->tx_open_txg
and ->tc_count.

Signed-off-by: Richard Yao
---
 module/zfs/txg.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index 9e320e5a864a..d7c03b873fc2 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -310,7 +310,17 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
 	mutex_enter(&tc->tc_open_lock);
 	txg = tx->tx_open_txg;
 
+	/*
+	 * We increment ->tc_count inside of the critical section so that the
+	 * zero check in txg_quiesce() does not race with the increment. We
+	 * use an atomic so that txg_rele_to_sync() can decrement the count
+	 * without taking &tc->tc_lock. This uses one fewer atomic instruction
+	 * on both increment and decrement than using &tc->tc_lock would.
+	 * Additionally, avoiding a second lock reduces the potential for
+	 * contention to cause us to block while holding the open lock.
+	 */
 	atomic_inc_64(&tc->tc_count[txg & TXG_MASK]);
+	mutex_exit(&tc->tc_open_lock);
 
 	th->th_cpu = tc;
 	th->th_txg = txg;
@@ -318,13 +328,13 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
 	return (txg);
 }
 
+/*
+ * This is obsolete, but we keep it around to make porting patches easier and
+ * also so that it can be used as a tracepoint.
+ */
 void
 txg_rele_to_quiesce(txg_handle_t *th)
 {
-	tx_cpu_t *tc = th->th_cpu;
-
-	ASSERT(!MUTEX_HELD(&tc->tc_lock));
-	mutex_exit(&tc->tc_open_lock);
 }
 
 void

From 3ac9d91b61a566307de42dcf998a63899f19791d Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Wed, 17 Feb 2016 16:16:25 -0500
Subject: [PATCH 3/4] txg visibility code should not execute under tc_open_lock

The memory allocation and locking in `spa_txg_history_*()` can block
txg_hold_open() for arbitrarily long periods of time.

Signed-off-by: Richard Yao
---
 module/zfs/txg.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index d7c03b873fc2..ebedccde43c9 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -373,6 +373,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 	tx_state_t *tx = &dp->dp_tx;
 	int g = txg & TXG_MASK;
 	int c;
+	uint64_t tx_open_time;
 
 	/*
 	 * Grab all tc_open_locks so nobody else can get into this txg.
@@ -382,10 +383,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 
 	ASSERT(txg == tx->tx_open_txg);
 	tx->tx_open_txg++;
-	tx->tx_open_time = gethrtime();
-
-	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx->tx_open_time);
-	spa_txg_history_add(dp->dp_spa, tx->tx_open_txg, tx->tx_open_time);
+	tx->tx_open_time = tx_open_time = gethrtime();
 
 	DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
 	DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
@@ -397,6 +395,9 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
 	for (c = 0; c < max_ncpus; c++)
 		mutex_exit(&tx->tx_cpu[c].tc_open_lock);
 
+	spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx_open_time);
+	spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time);
+
 	/*
 	 * Quiesce the transaction group by waiting for everyone to txg_exit().
 	 */
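The fix in patch 3 is a general pattern: snapshot whatever the critical
section produces into locals, drop the lock, and only then run work that may
allocate or block. A small userspace sketch of that pattern, assuming a
hypothetical record_history() that stands in for the spa_txg_history_*()
calls:

	#include <pthread.h>
	#include <stdint.h>
	#include <time.h>

	static pthread_mutex_t open_lock = PTHREAD_MUTEX_INITIALIZER;
	static uint64_t open_txg;
	static uint64_t open_time;

	/* Hypothetical stand-in for spa_txg_history_*(); may allocate and block. */
	extern void record_history(uint64_t txg, uint64_t when);

	static uint64_t
	now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
	}

	static void
	advance_txg(void)
	{
		uint64_t txg, when;

		pthread_mutex_lock(&open_lock);
		txg = open_txg++;
		open_time = when = now_ns();	/* snapshot into locals under the lock */
		pthread_mutex_unlock(&open_lock);

		/* Slow, possibly blocking bookkeeping runs with the lock dropped. */
		record_history(txg, when);
		record_history(txg + 1, when);
	}

Copying the timestamp into the local tx_open_time is what lets the
spa_txg_history_*() calls move below the mutex_exit() loop without reading
state that the next open txg may already be overwriting.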
From 41595690f47e51a20f612bee442ecee97f94dc9e Mon Sep 17 00:00:00 2001
From: Richard Yao
Date: Fri, 12 Feb 2016 18:53:09 -0500
Subject: [PATCH 4/4] Revert "Wrap smp_processor_id in kpreempt_[dis|en]able"
 and "Fix CPU_SEQID use in preemptible context"

This reverts commit 15a9e03368d8f186751a432740a5a281f45d712d and commit
8878261fc9447592844db5f7eb3df9ed3b088871.

Wrapping CPU_SEQID with kpreempt_disable() and kpreempt_enable() causes
us to call preempt_schedule(). This allows the thread to be rescheduled
onto another CPU while it still references the per-CPU structure of the
CPU it just vacated, so threads that subsequently run on the vacated CPU
can begin using that same structure and contend with us. This is an
unnecessary source of contention and a regression from the illumos
behavior. We can turn off the Linux warnings by calling
`raw_smp_processor_id()` instead of wrapping things with
kpreempt_disable() and kpreempt_enable(), so let's do that.

Requires-spl: refs/pull/532/head
Signed-off-by: Richard Yao
---
 include/sys/zfs_context.h |  4 ----
 module/zfs/fm.c           |  8 +-------
 module/zfs/txg.c          | 12 +-----------
 module/zfs/zio.c          |  7 +------
 4 files changed, 3 insertions(+), 28 deletions(-)

diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index cc626fdaaae1..9ccf1eee3448 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -67,7 +67,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -262,9 +261,6 @@ extern kthread_t *zk_thread_create(caddr_t stk, size_t stksize,
 	proc_t *pp, int state, pri_t pri, int detachstate);
 extern void zk_thread_join(kt_did_t tid);
 
-#define	kpreempt_disable()	((void)0)
-#define	kpreempt_enable()	((void)0)
-
 #define	PS_NONE		-1
 
 #define	issig(why)	(FALSE)
diff --git a/module/zfs/fm.c b/module/zfs/fm.c
index 4de901ed09c7..56d3981078f0 100644
--- a/module/zfs/fm.c
+++ b/module/zfs/fm.c
@@ -1515,13 +1515,7 @@ fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
 uint64_t
 fm_ena_generate(uint64_t timestamp, uchar_t format)
 {
-	uint64_t ena;
-
-	kpreempt_disable();
-	ena = fm_ena_generate_cpu(timestamp, getcpuid(), format);
-	kpreempt_enable();
-
-	return (ena);
+	return (fm_ena_generate_cpu(timestamp, getcpuid(), format));
 }
 
 uint64_t
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index ebedccde43c9..dd886e3d7270 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -294,19 +294,9 @@ uint64_t
 txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
 {
 	tx_state_t *tx = &dp->dp_tx;
-	tx_cpu_t *tc;
+	tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
 	uint64_t txg;
 
-	/*
-	 * It appears the processor id is simply used as a "random"
-	 * number to index into the array, and there isn't any other
-	 * significance to the chosen tx_cpu. Because.. Why not use
-	 * the current cpu to index into the array?
-	 */
-	kpreempt_disable();
-	tc = &tx->tx_cpu[CPU_SEQID];
-	kpreempt_enable();
-
 	mutex_enter(&tc->tc_open_lock);
 	txg = tx->tx_open_txg;
 
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 2d16e632de06..4e01f528b0a2 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1542,19 +1542,14 @@ zio_nowait(zio_t *zio)
 
 	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
 	    zio_unique_parent(zio) == NULL) {
-		zio_t *pio;
-
 		/*
 		 * This is a logical async I/O with no parent to wait for it.
 		 * We add it to the spa_async_root_zio "Godfather" I/O which
 		 * will ensure they complete prior to unloading the pool.
 		 */
 		spa_t *spa = zio->io_spa;
 
-		kpreempt_disable();
-		pio = spa->spa_async_zio_root[CPU_SEQID];
-		kpreempt_enable();
-
-		zio_add_child(pio, zio);
+		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
 	}
 
 	__zio_execute(zio);
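In userspace terms, the point of the revert is that the CPU id here is only a
load-spreading hint: each per-CPU slot carries its own lock, so acting on a
stale id after a migration is still correct, just marginally less
cache-friendly. A sketch under those assumptions, with sched_getcpu() playing
the role of raw_smp_processor_id() and NSLOTS standing in for max_ncpus:

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <sched.h>
	#include <stdint.h>

	#define	NSLOTS	64		/* stands in for max_ncpus */

	typedef struct slot {
		pthread_mutex_t	s_lock;
		uint64_t	s_count;
	} slot_t;

	static slot_t slots[NSLOTS];

	static void
	slots_init(void)
	{
		for (int i = 0; i < NSLOTS; i++)
			pthread_mutex_init(&slots[i].s_lock, NULL);
	}

	static void
	slot_hold(void)
	{
		/*
		 * No preemption guard: if we migrate between reading the CPU
		 * id and taking the lock, we simply use another CPU's slot.
		 * The result is still correct because every slot has its own
		 * lock.
		 */
		int c = sched_getcpu();
		slot_t *s = &slots[(c < 0 ? 0 : c) % NSLOTS];

		pthread_mutex_lock(&s->s_lock);
		s->s_count++;
		pthread_mutex_unlock(&s->s_lock);
	}

slots_init() must run once before the first slot_hold(). Disabling preemption
around the lookup would buy little here: the moment it is re-enabled the
thread can migrate anyway, and the preempt_schedule() on the way out only
adds the scheduling point that the revert removes.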