From 79e9a454684892da0b8faa6ede6344ba8a4c0b90 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 25 Mar 2014 15:41:18 -0400 Subject: [PATCH] Fix deadlock in zfs_zget() zfsonlinux/zfs#180 occurred because of a race between inode eviction and zfs_zget(). zfsonlinux/zfs@36df284366caa77cb40083d2e6bcce02274e2f05 tried to address it by making an upcall to the VFS to learn whether an inode is being evicted and spin until it leaves eviction. This is a hack around the fact that we cannot ensure "SA" does immediate eviction by hooking into generic_drop_inode(), which is GPL exported and cannot be wrapped. Unfortunately, the act of calling ilookup to avoid this race during writeback creates a deadlock: [ 602.268492] INFO: task kworker/u24:6:891 blocked for more than 120 seconds. [ 602.268496] Tainted: P O 3.13.6 #1 [ 602.268498] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 602.268500] kworker/u24:6 D ffff88107fcd2e80 0 891 2 0x00000000 [ 602.268511] Workqueue: writeback bdi_writeback_workfn (flush-zfs-5) [ 602.268522] ffff8810370ff950 0000000000000002 ffff88103853d940 0000000000012e80 [ 602.268526] ffff8810370fffd8 0000000000012e80 ffff88103853d940 ffff880f5c8be098 [ 602.268530] ffff88107ffb6950 ffff8810370ff980 ffff88103a9a5b78 0000000000000000 [ 602.268534] Call Trace: [ 602.268541] [] schedule+0x24/0x70 [ 602.268546] [] __wait_on_freeing_inode+0x99/0xc0 [ 602.268552] [] ? autoremove_wake_function+0x40/0x40 [ 602.268555] [] find_inode_fast+0x78/0xb0 [ 602.268559] [] ilookup+0x65/0xd0 [ 602.268590] [] zfs_zget+0xdb/0x260 [zfs] [ 602.268594] [] ? __mutex_lock_slowpath+0x21b/0x360 [ 602.268613] [] zfs_get_data+0x46/0x340 [zfs] [ 602.268631] [] zil_add_block+0xa31/0xc00 [zfs] [ 602.268634] [] ? mutex_unlock+0x9/0x10 [ 602.268651] [] zil_commit+0x12/0x20 [zfs] [ 602.268669] [] zpl_putpage+0x174/0x840 [zfs] [ 602.268674] [] do_writepages+0x1c/0x40 [ 602.268677] [] __writeback_single_inode+0x3b/0x2b0 [ 602.268680] [] writeback_sb_inodes+0x247/0x420 [ 602.268684] [] wb_writeback+0xe3/0x320 [ 602.268689] [] ? set_worker_desc+0x71/0x80 [ 602.268692] [] bdi_writeback_workfn+0xfe/0x490 [ 602.268696] [] ? _raw_spin_unlock_irq+0x14/0x40 [ 602.268700] [] ? finish_task_switch+0x59/0x130 [ 602.268703] [] process_one_work+0x16c/0x490 [ 602.268706] [] worker_thread+0x113/0x390 [ 602.268710] [] ? manage_workers.isra.27+0x2a0/0x2a0 [ 602.268713] [] kthread+0xdf/0x100 [ 602.268717] [] ? arch_vtime_task_switch+0x8e/0xa0 [ 602.268720] [] ? kthread_create_on_node+0x190/0x190 [ 602.268723] [] ret_from_fork+0x7c/0xb0 [ 602.268730] [] ? kthread_create_on_node+0x190/0x190 The return value from igrab() gives us the information that ifind() provided without the risk of a deadlock. Ideally, we should ask upstream to export generic_drop_inode() so that we can wrap it to properly handle this situation, but until then, lets hook into the return value of ifind() so that we do not deadlock here. In addition, zfs_zget() should exit with a hold on the inode, but that was never present in the Linux port. iThis can lead to undefined behavior, such as inodes that are evicted when they have users. The function is modified to ensure that successful exit from this function has a hold on the inode, which the code expects. Signed-off-by: Richard Yao Please enter the commit message for your changes. Lines starting --- module/zfs/zfs_znode.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/module/zfs/zfs_znode.c b/module/zfs/zfs_znode.c index 5e9941034c1c..2ed24590c299 100644 --- a/module/zfs/zfs_znode.c +++ b/module/zfs/zfs_znode.c @@ -859,19 +859,15 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp) znode_t *zp; int err; sa_handle_t *hdl; - struct inode *ip; *zpp = NULL; again: - ip = ilookup(zsb->z_sb, obj_num); - ZFS_OBJ_HOLD_ENTER(zsb, obj_num); err = sa_buf_hold(zsb->z_os, obj_num, NULL, &db); if (err) { ZFS_OBJ_HOLD_EXIT(zsb, obj_num); - iput(ip); return (err); } @@ -882,25 +878,11 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp) doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zsb, obj_num); - iput(ip); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { - if (ip == NULL) { - /* - * ilookup returned NULL, which means - * the znode is dying - but the SA handle isn't - * quite dead yet, we need to drop any locks - * we're holding, re-schedule the task and try again. - */ - sa_buf_rele(db, NULL); - ZFS_OBJ_HOLD_EXIT(zsb, obj_num); - - schedule(); - goto again; - } zp = sa_get_userdata(hdl); @@ -917,19 +899,31 @@ zfs_zget(zfs_sb_t *zsb, uint64_t obj_num, znode_t **zpp) if (zp->z_unlinked) { err = SET_ERROR(ENOENT); } else { - igrab(ZTOI(zp)); + /* + * if igrab returns NULL, the znode is being evicted, + * but the SA handle did not do immediate eviction like + * we expected because GPL symbol exportrestrictions + * prevent us from hooking into ->drop_inode() at this + * time. As a workaround, we drop any locks we're + * holding, re-schedule the task and try again. + */ + if (igrab(ZTOI(zp)) == NULL) { + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zsb, obj_num); + + schedule(); + goto again; + } *zpp = zp; err = 0; } sa_buf_rele(db, NULL); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zsb, obj_num); - iput(ip); return (err); } - ASSERT3P(ip, ==, NULL); - /* * Not found create new znode/vnode but only if file exists. *