From cd06b1bcac4ef6fd4fd5d17d9f22b0da6869fc24 Mon Sep 17 00:00:00 2001 From: Grady Wong <grady.w@xtaotech.com> Date: Mon, 8 Oct 2018 10:32:40 +0800 Subject: [PATCH] fix write IO hang. The sequence of events that triggers the bug: 1. Context #1: `zfs_write` is assigned txg "n". 2. Context #2, in the same process: an mmap page fault occurs (which means `mm_sem` is held); `zfs_dirty_inode` fails to open a txg and waits for the previous txg "n" to complete. 3. Context #1 calls `uiomove` to write, but a page fault occurs inside `uiomove`, which means it needs `mm_sem`. Because `mm_sem` is held by context #2, context #1 is stuck and cannot complete, so txg "n" never completes. Context #1 and context #2 are therefore deadlocked. The fix: `zfs_write` now copies user data with page faults disabled and, if the copy returns EFAULT, commits the tx, prefaults the user pages and retries; `zfs_dirty_inode` first attempts a TXG_NOWAIT assignment and, on ERESTART, calls `dmu_tx_wait` before retrying. Signed-off-by: Grady Wong <grady.w@xtaotech.com> --- include/spl/sys/uio.h | 1 + module/zcommon/zfs_uio.c | 21 +++++++++++++++++++-- module/zfs/zfs_vnops.c | 20 +++++++++++++++++++- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/include/spl/sys/uio.h b/include/spl/sys/uio.h index 64c452b8d17f..fac26079d7bc 100644 --- a/include/spl/sys/uio.h +++ b/include/spl/sys/uio.h @@ -53,6 +53,7 @@ typedef struct uio { int uio_iovcnt; offset_t uio_loffset; uio_seg_t uio_segflg; + boolean_t uio_fault_disable; uint16_t uio_fmode; uint16_t uio_extflg; offset_t uio_limit; diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c index af9716126f6d..9b7eceaf81e0 100644 --- a/module/zcommon/zfs_uio.c +++ b/module/zcommon/zfs_uio.c @@ -52,6 +52,7 @@ #include <sys/sysmacros.h> #include <sys/strings.h> #include <linux/kmap_compat.h> +#include <linux/uaccess.h> /* * Move "n" bytes at byte address "p"; "rw" indicates the direction @@ -79,8 +80,24 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio) if (copy_to_user(iov->iov_base+skip, p, cnt)) return (EFAULT); } else { - if (copy_from_user(p, iov->iov_base+skip, cnt)) - return (EFAULT); + if (uio->uio_fault_disable) { + if (!access_ok(VERIFY_READ, + (iov->iov_base + skip), cnt)) { + return (SET_ERROR(EFAULT)); + } + + 
pagefault_disable(); + if (__copy_from_user_inatomic(p, + (iov->iov_base + skip), cnt)) { + pagefault_enable(); + return (SET_ERROR(EFAULT)); + } + pagefault_enable(); + } else { + if (copy_from_user(p, + (iov->iov_base + skip), cnt)) + return (SET_ERROR(EFAULT)); + } } break; case UIO_SYSSPACE: diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 4e163e2e3fe8..76c8e2510cf9 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -809,8 +809,17 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = uio->uio_resid; + uio->uio_fault_disable = B_TRUE; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); + if (error == EFAULT) { + dmu_tx_commit(tx); + uio_prefaultpages(MIN(n, max_blksz), uio); + continue; + } else if (error != 0) { + dmu_tx_abort(tx); + break; + } tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; @@ -4636,13 +4645,22 @@ zfs_dirty_inode(struct inode *ip, int flags) } #endif +top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); + boolean_t waited = B_FALSE; + error = dmu_tx_assign(tx, + waited ? (TXG_NOTHROTTLE | TXG_WAIT) : TXG_NOWAIT); if (error) { + if (error == ERESTART && waited == B_FALSE) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } dmu_tx_abort(tx); goto out; }