From cd06b1bcac4ef6fd4fd5d17d9f22b0da6869fc24 Mon Sep 17 00:00:00 2001
From: Grady Wong <grady.w@xtaotech.com>
Date: Mon, 8 Oct 2018 10:32:40 +0800
Subject: [PATCH] fix write IO hang.

The time sequence of the bug:
1. Context #1: `zfs_write` is assigned txg "n".
2. Context #2, in the same process: an mmap page fault occurs (which
   means `mm_sem` is held), `zfs_dirty_inode` fails to open a txg,
   and it waits for the previous txg "n" to complete.
3. Context #1 calls `uiomove` to write, but a page fault occurs in
   `uiomove`, which means it needs `mm_sem`. Because `mm_sem` is held
   by context #2, context #1 is stuck and cannot complete, so txg "n"
   will never complete.

As a result, context #1 and context #2 are deadlocked.

Signed-off-by: Grady Wong <grady.w@xtaotech.com>
---
 include/spl/sys/uio.h    |  1 +
 module/zcommon/zfs_uio.c | 21 +++++++++++++++++++--
 module/zfs/zfs_vnops.c   | 20 +++++++++++++++++++-
 3 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/include/spl/sys/uio.h b/include/spl/sys/uio.h
index 64c452b8d17f..fac26079d7bc 100644
--- a/include/spl/sys/uio.h
+++ b/include/spl/sys/uio.h
@@ -53,6 +53,7 @@ typedef struct uio {
 	int		uio_iovcnt;
 	offset_t	uio_loffset;
 	uio_seg_t	uio_segflg;
+	boolean_t	uio_fault_disable;
 	uint16_t	uio_fmode;
 	uint16_t	uio_extflg;
 	offset_t	uio_limit;
diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c
index af9716126f6d..9b7eceaf81e0 100644
--- a/module/zcommon/zfs_uio.c
+++ b/module/zcommon/zfs_uio.c
@@ -52,6 +52,7 @@
 #include <sys/sysmacros.h>
 #include <sys/strings.h>
 #include <linux/kmap_compat.h>
+#include <linux/uaccess.h>
 
 /*
  * Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -79,8 +80,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
 				if (copy_to_user(iov->iov_base+skip, p, cnt))
 					return (EFAULT);
 			} else {
-				if (copy_from_user(p, iov->iov_base+skip, cnt))
-					return (EFAULT);
+				if (uio->uio_fault_disable) {
+					if (!access_ok(VERIFY_READ,
+					    (iov->iov_base + skip), cnt)) {
+						return (SET_ERROR(EFAULT));
+					}
+
+					pagefault_disable();
+					if (__copy_from_user_inatomic(p,
+					    (iov->iov_base + skip), cnt)) {
+						pagefault_enable();
+						return (SET_ERROR(EFAULT));
+					}
+					pagefault_enable();
+				} else {
+					if (copy_from_user(p,
+					    (iov->iov_base + skip), cnt))
+						return (SET_ERROR(EFAULT));
+				}
 			}
 			break;
 		case UIO_SYSSPACE:
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 4e163e2e3fe8..76c8e2510cf9 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -809,8 +809,17 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 		ssize_t tx_bytes;
 		if (abuf == NULL) {
 			tx_bytes = uio->uio_resid;
+			uio->uio_fault_disable = B_TRUE;
 			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, nbytes, tx);
+			if (error == EFAULT) {
+				dmu_tx_commit(tx);
+				uio_prefaultpages(MIN(n, max_blksz), uio);
+				continue;
+			} else if (error != 0) {
+				dmu_tx_abort(tx);
+				break;
+			}
 			tx_bytes -= uio->uio_resid;
 		} else {
 			tx_bytes = nbytes;
@@ -4636,13 +4645,22 @@ zfs_dirty_inode(struct inode *ip, int flags)
 	}
 #endif
 
+top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 
-	error = dmu_tx_assign(tx, TXG_WAIT);
+	boolean_t waited = B_FALSE;
+	error = dmu_tx_assign(tx,
+	    waited ? (TXG_NOTHROTTLE | TXG_WAIT) : TXG_NOWAIT);
 	if (error) {
+		if (error == ERESTART && waited == B_FALSE) {
+			waited = B_TRUE;
+			dmu_tx_wait(tx);
+			dmu_tx_abort(tx);
+			goto top;
+		}
 		dmu_tx_abort(tx);
 		goto out;
 	}