Skip to content

Commit

Permalink
fix write IO hang.
Browse files Browse the repository at this point in the history
The bug time sequence:
1. context #1, `zfs_write` assign a txg "n".
2. In a same process, context #2, mmap page fault (which means the
   `mm_sem` is hold) occurred, `zfs_dirty_inode` open a txg failed,
   and wait previous txg "n" completed.
3. context #1 call `uiomove` to write, however page fault is occurred
   in `uiomove`, which means it need `mm_sem`, but `mm_sem` is hold by
   context #2, so it stuck and can't complete,  then txg "n" will
   not complete.

So context #1 and context #2 trap into the "dead lock".

Signed-off-by: Grady Wong <[email protected]>
  • Loading branch information
Grady Wong committed Sep 29, 2018
1 parent 7522a26 commit 69a0632
Show file tree
Hide file tree
Showing 9 changed files with 100 additions and 22 deletions.
3 changes: 2 additions & 1 deletion include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -778,6 +778,7 @@ void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
int dmu_tx_reassign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_wait(dmu_tx_t *tx);
void dmu_tx_commit(dmu_tx_t *tx);
void dmu_tx_mark_netfree(dmu_tx_t *tx);
Expand Down Expand Up @@ -846,7 +847,7 @@ int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
dmu_tx_t *tx, boolean_t fault_disable);
#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
Expand Down
2 changes: 1 addition & 1 deletion include/sys/uio_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@

#include <sys/uio.h>

extern int uiomove(void *, size_t, enum uio_rw, uio_t *);
extern int uiomove(void *, size_t, enum uio_rw, uio_t *, boolean_t);
extern void uio_prefaultpages(ssize_t, uio_t *);
extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *);
extern void uioskip(uio_t *, size_t);
Expand Down
26 changes: 20 additions & 6 deletions module/zcommon/zfs_uio.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
#include <sys/sysmacros.h>
#include <sys/strings.h>
#include <linux/kmap_compat.h>
#include <linux/uaccess.h>

/*
* Move "n" bytes at byte address "p"; "rw" indicates the direction
Expand All @@ -60,7 +61,8 @@
* a non-zero errno on failure.
*/
static int
uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio,
boolean_t fault_disable)
{
const struct iovec *iov = uio->uio_iov;
size_t skip = uio->uio_skip;
Expand All @@ -79,8 +81,19 @@ uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio)
if (copy_to_user(iov->iov_base+skip, p, cnt))
return (EFAULT);
} else {
if (copy_from_user(p, iov->iov_base+skip, cnt))
return (EFAULT);
if (fault_disable) {
pagefault_disable();
if (__copy_from_user_inatomic(p,
iov->iov_base+skip, cnt)) {
pagefault_enable();
return (EFAULT);
}
pagefault_enable();
} else {
if (copy_from_user(p,
iov->iov_base+skip, cnt))
return (EFAULT);
}
}
break;
case UIO_SYSSPACE:
Expand Down Expand Up @@ -141,10 +154,11 @@ uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio)
}

int
uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio)
uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio,
boolean_t fault_disable)
{
if (uio->uio_segflg != UIO_BVEC)
return (uiomove_iov(p, n, rw, uio));
return (uiomove_iov(p, n, rw, uio, fault_disable));
else
return (uiomove_bvec(p, n, rw, uio));
}
Expand Down Expand Up @@ -222,7 +236,7 @@ uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes)
int ret;

bcopy(uio, &uio_copy, sizeof (struct uio));
ret = uiomove(p, n, rw, &uio_copy);
ret = uiomove(p, n, rw, &uio_copy, B_FALSE);
*cbytes = uio->uio_resid - uio_copy.uio_resid;
return (ret);
}
Expand Down
11 changes: 6 additions & 5 deletions module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1424,7 +1424,7 @@ dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
} else
#endif
err = uiomove((char *)db->db_data + bufoff, tocpy,
UIO_READ, uio);
UIO_READ, uio, B_FALSE);
if (err)
break;

Expand Down Expand Up @@ -1488,7 +1488,8 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
}

int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx,
boolean_t fault_disable)
{
dmu_buf_t **dbp;
int numbufs;
Expand Down Expand Up @@ -1524,7 +1525,7 @@ dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
* block.
*/
err = uiomove((char *)db->db_data + bufoff, tocpy,
UIO_WRITE, uio);
UIO_WRITE, uio, fault_disable);

if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
Expand Down Expand Up @@ -1561,7 +1562,7 @@ dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,

DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
err = dmu_write_uio_dnode(dn, uio, size, tx);
err = dmu_write_uio_dnode(dn, uio, size, tx, B_TRUE);
DB_DNODE_EXIT(db);

return (err);
Expand All @@ -1586,7 +1587,7 @@ dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
if (err)
return (err);

err = dmu_write_uio_dnode(dn, uio, size, tx);
err = dmu_write_uio_dnode(dn, uio, size, tx, B_FALSE);

dnode_rele(dn, FTAG);

Expand Down
34 changes: 34 additions & 0 deletions module/zfs/dmu_tx.c
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,40 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
return (0);
}

int
dmu_tx_reassign(dmu_tx_t *tx, uint64_t txg_how)
{
int err;
dmu_tx_hold_t *txh;

/*
* Walk the transaction's hold list, removing the hold on the
* associated dnode, and notifying waiters if the refcount drops to 0.
*/
for (txh = list_head(&tx->tx_holds); txh != NULL;
txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;

if (dn == NULL)
continue;
mutex_enter(&dn->dn_mtx);
ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
dn->dn_assigned_txg = 0;
cv_broadcast(&dn->dn_notxholds);
}
mutex_exit(&dn->dn_mtx);
}

txg_rele_to_sync(&tx->tx_txgh);

tx->tx_txg = 0;
err = dmu_tx_assign(tx, txg_how);

return (err);
}

void
dmu_tx_wait(dmu_tx_t *tx)
{
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/sa.c
Original file line number Diff line number Diff line change
Expand Up @@ -1517,7 +1517,7 @@ sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio)
mutex_enter(&hdl->sa_lock);
if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
uio->uio_resid), UIO_READ, uio);
uio->uio_resid), UIO_READ, uio, B_FALSE);
}
mutex_exit(&hdl->sa_lock);
return (error);
Expand Down
6 changes: 3 additions & 3 deletions module/zfs/zfs_sa.c
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,13 @@ zfs_sa_readlink(znode_t *zp, uio_t *uio)
if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) {
error = uiomove((caddr_t)db->db_data +
ZFS_OLD_ZNODE_PHYS_SIZE,
MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio, B_FALSE);
} else {
dmu_buf_t *dbp;
if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id,
0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) {
error = uiomove(dbp->db_data,
MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
error = uiomove(dbp->db_data, MIN((size_t)bufsz,
uio->uio_resid), UIO_READ, uio, B_FALSE);
dmu_buf_rele(dbp, FTAG);
}
}
Expand Down
35 changes: 31 additions & 4 deletions module/zfs/zfs_vnops.c
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,8 @@ mappedread(struct inode *ip, int nbytes, uio_t *uio)
unlock_page(pp);

pb = kmap(pp);
error = uiomove(pb + off, bytes, UIO_READ, uio);
error = uiomove(pb + off, bytes, UIO_READ, uio,
B_FALSE);
kunmap(pp);

if (mapping_writably_mapped(mp))
Expand Down Expand Up @@ -808,9 +809,26 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)

ssize_t tx_bytes;
if (abuf == NULL) {
int reassign_error = 0;
tx_bytes = uio->uio_resid;
error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
uio, nbytes, tx);
do {
error = dmu_write_uio_dbuf(
sa_get_db(zp->z_sa_hdl), uio, nbytes, tx);
if (error == EFAULT) {
uio_prefaultpages(MIN(n, max_blksz),
uio);
reassign_error = dmu_tx_reassign(tx,
TXG_WAIT);
if (reassign_error) {
break;
}
}
} while (error == EFAULT);

if (reassign_error || error) {
dmu_tx_abort(tx);
break;
}
tx_bytes -= uio->uio_resid;
} else {
tx_bytes = nbytes;
Expand Down Expand Up @@ -4636,13 +4654,22 @@ zfs_dirty_inode(struct inode *ip, int flags)
}
#endif

top:
tx = dmu_tx_create(zfsvfs->z_os);

dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);

error = dmu_tx_assign(tx, TXG_WAIT);
boolean_t waited = B_FALSE;
error = dmu_tx_assign(tx,
waited ? (TXG_NOTHROTTLE | TXG_WAIT) : TXG_NOWAIT);
if (error) {
if (error == ERESTART && waited == B_FALSE) {
waited = B_TRUE;
dmu_tx_wait(tx);
dmu_tx_abort(tx);
goto top;
}
dmu_tx_abort(tx);
goto out;
}
Expand Down
3 changes: 2 additions & 1 deletion module/zfs/zvol.c
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,8 @@ zvol_write(void *arg)
dmu_tx_abort(tx);
break;
}
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
B_FALSE);
if (error == 0) {
zvol_log_write(zv, tx, off, bytes, sync);
}
Expand Down

0 comments on commit 69a0632

Please sign in to comment.