From 69a0632f28fc4a2cbdebf96ed03c429411273fd2 Mon Sep 17 00:00:00 2001
From: Grady Wong
Date: Fri, 21 Sep 2018 15:53:12 +0800
Subject: [PATCH] Fix write I/O hang caused by page faults under an assigned
 txg

Sequence of events that triggers the deadlock:

1. In context #1, zfs_write() assigns a txg "n".

2. In context #2 of the same process, an mmap page fault occurs, so
   mm_sem is held.  zfs_dirty_inode() fails to open a new txg and
   waits for the previous txg "n" to complete.

3. Context #1 calls uiomove() to copy the write data.  uiomove()
   takes a page fault, which needs mm_sem, but mm_sem is held by
   context #2.  Context #1 therefore blocks and txg "n" can never
   complete.

Context #1 and context #2 are now deadlocked against each other.

Fix it by never taking a page fault while a txg is assigned: copy the
user data with page faults disabled, and if the copy fails with
EFAULT, fault the pages in with uio_prefaultpages(), move the tx to a
fresh txg with the new dmu_tx_reassign(), and retry the copy.
zfs_dirty_inode() is also switched to the usual TXG_NOWAIT +
dmu_tx_wait() retry pattern so it no longer blocks inside
dmu_tx_assign() on its first attempt.

Signed-off-by: Grady Wong
---
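Notes (everything between this line and the diffstat is dropped by
git am, so it does not become part of the commit):

For reviewers, a hypothetical userspace sketch of the trigger
described above.  The pool layout, paths, and sizes are made up, and
the hang is timing-dependent (it also needs enough dirty-data
pressure that zfs_dirty_inode() has to wait on an open txg), so this
only illustrates the shape of the bug rather than being a
deterministic test case:

	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int
	main(void)
	{
		size_t len = 1UL << 20;
		int src = open("/tank/fs/src", O_RDWR | O_CREAT, 0644);
		int dst = open("/tank/fs/dst", O_RDWR | O_CREAT, 0644);
		char *buf;

		(void) ftruncate(src, len);

		/* The write() source is an mmap of a file on the same fs. */
		buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
		    src, 0);

		/*
		 * buf is deliberately not touched first, so the ZFS write
		 * path takes its first page fault inside uiomove() after
		 * the txg has already been assigned, while the fault
		 * handler holds mm_sem.
		 */
		(void) write(dst, buf, len);
		return (0);
	}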
 include/sys/dmu.h        |  3 ++-
 include/sys/uio_impl.h   |  2 +-
 module/zcommon/zfs_uio.c | 26 ++++++++++++++++++++------
 module/zfs/dmu.c         | 11 ++++++-----
 module/zfs/dmu_tx.c      | 34 ++++++++++++++++++++++++++++++++++
 module/zfs/sa.c          |  2 +-
 module/zfs/zfs_sa.c      |  6 +++---
 module/zfs/zfs_vnops.c   | 40 +++++++++++++++++++++++++++++++++-----
 module/zfs/zvol.c        |  3 ++-
 9 files changed, 105 insertions(+), 22 deletions(-)

diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index bc7046fdced8..90fe2a9424e4 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -778,6 +778,7 @@ void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
 void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+int dmu_tx_reassign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_mark_netfree(dmu_tx_t *tx);
@@ -846,7 +847,7 @@ int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
 int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
 	dmu_tx_t *tx);
 int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size,
-	dmu_tx_t *tx);
+	dmu_tx_t *tx, boolean_t fault_disable);
 #endif
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h
index 37e283da0f8b..2cd9ac2f9ba6 100644
--- a/include/sys/uio_impl.h
+++ b/include/sys/uio_impl.h
@@ -41,7 +41,7 @@
 
 #include <sys/uio.h>
 
-extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
+extern int uiomove(void *, size_t, enum uio_rw, uio_t *, boolean_t);
 extern void uio_prefaultpages(ssize_t, uio_t *);
 extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
 extern void uioskip(uio_t *, size_t);
diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c
index af9716126f6d..8a6b1ea36fd5 100644
--- a/module/zcommon/zfs_uio.c
+++ b/module/zcommon/zfs_uio.c
@@ -52,6 +52,7 @@
 #include <sys/uio_impl.h>
 #include <sys/sysmacros.h>
 #include <sys/strings.h>
+#include <linux/uaccess.h>
 
 /*
  * Move "n" bytes at byte address "p"; "rw" indicates the direction
@@ -60,7 +61,8 @@
  * a non-zero errno on failure.
 
+ */ + for (txh = list_head(&tx->tx_holds); txh != NULL; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + } + + txg_rele_to_sync(&tx->tx_txgh); + + tx->tx_txg = 0; + err = dmu_tx_assign(tx, txg_how); + + return (err); +} + void dmu_tx_wait(dmu_tx_t *tx) { diff --git a/module/zfs/sa.c b/module/zfs/sa.c index caa91bc4c4e1..511368f033c9 100644 --- a/module/zfs/sa.c +++ b/module/zfs/sa.c @@ -1517,7 +1517,7 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, - uio->uio_resid), UIO_READ, uio); + uio->uio_resid), UIO_READ, uio, B_FALSE); } mutex_exit(&hdl->sa_lock); return (error); diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c index bd21ba896cc3..dc611fc1e67b 100644 --- a/module/zfs/zfs_sa.c +++ b/module/zfs/zfs_sa.c @@ -81,13 +81,13 @@ zfs_sa_readlink(znode_t *zp, uio_t *uio) if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { error = uiomove((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio, B_FALSE); } else { dmu_buf_t *dbp; if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { - error = uiomove(dbp->db_data, - MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); + error = uiomove(dbp->db_data, MIN((size_t)bufsz, + uio->uio_resid), UIO_READ, uio, B_FALSE); dmu_buf_rele(dbp, FTAG); } } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 4e163e2e3fe8..80913f1d2dc4 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -392,7 +392,8 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio) unlock_page(pp); pb = kmap(pp); - error = uiomove(pb + off, bytes, UIO_READ, uio); + error = uiomove(pb + off, bytes, UIO_READ, uio, + B_FALSE); kunmap(pp); if (mapping_writably_mapped(mp)) @@ -808,9 +809,26 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) ssize_t tx_bytes; if (abuf == NULL) { + int reassign_error = 0; tx_bytes = uio->uio_resid; - error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), - uio, nbytes, tx); + do { + error = dmu_write_uio_dbuf( + sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); + if (error == EFAULT) { + uio_prefaultpages(MIN(n, max_blksz), + uio); + reassign_error = dmu_tx_reassign(tx, + TXG_WAIT); + if (reassign_error) { + break; + } + } + } while (error == EFAULT); + + if (reassign_error || error) { + dmu_tx_abort(tx); + break; + } tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; @@ -4636,13 +4654,22 @@ zfs_dirty_inode(struct inode *ip, int flags) } #endif +top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); - error = dmu_tx_assign(tx, TXG_WAIT); + boolean_t waited = B_FALSE; + error = dmu_tx_assign(tx, + waited ? 
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index f7706f14312f..60cd43bbfc5d 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -769,7 +769,8 @@ zvol_write(void *arg)
 			dmu_tx_abort(tx);
 			break;
 		}
-		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
+		    B_FALSE);
 		if (error == 0) {
 			zvol_log_write(zv, tx, off, bytes, sync);
 		}
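
P.S. Two reviewer notes on the new interfaces, with condensed sketches
that are illustrative only and not part of the applied diff.

uiomove_iov() with fault_disable set relies on a kernel idiom worth
spelling out: __copy_from_user_inatomic() returns the number of bytes
it could not copy, and between pagefault_disable() and
pagefault_enable() the fault handler is never entered, so the copy
fails fast with EFAULT instead of sleeping in the page fault path
while the txg is held (dst, src, and len are placeholders):

	pagefault_disable();
	left = __copy_from_user_inatomic(dst, src, len);
	pagefault_enable();
	if (left != 0)
		return (EFAULT);	/* caller prefaults and retries */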
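The caller-side contract of dmu_tx_reassign(), as zfs_write() uses it:
on success the dnode holds are re-established against a new open txg
and the copy can simply be retried; on failure the tx no longer has an
assigned txg, so it must be aborted, never committed (sketch; the
surrounding loop and error handling are elided):

	if (dmu_tx_reassign(tx, TXG_WAIT) != 0) {
		/* unassigned again: abort is the only legal cleanup */
		dmu_tx_abort(tx);
		break;
	}
	/* holds now bound to a fresh txg: retry dmu_write_uio_dbuf() */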