From 4d010b4738c6c16011f48020431cd89dbc69394e Mon Sep 17 00:00:00 2001
From: Jeff Dike
Date: Thu, 1 Aug 2019 09:13:11 -0400
Subject: [PATCH] Fix lockdep circular locking false positive involving sa_lock

This patch is an RFC. There are at least two things it could be getting
wrong:

1 - The use of a mutex to protect an increment; I couldn't see how to
    get atomic operations into this code (a possible atomic-based
    alternative is sketched after the diff below).
2 - There could easily be a better fix than putting each newly
    allocated sa_os_t in its own lockdep class.

This fix does eliminate the lockdep warnings, so there's that.

The two lockdep reports are below. They show two different deadlock
scenarios, but they share a common link, which is thread 1 holding
sa_lock and trying to get zap->zap_rwlock:

    zap_lockdir_impl+0x858/0x16c0 [zfs]
    zap_lockdir+0xd2/0x100 [zfs]
    zap_lookup_norm+0x7f/0x100 [zfs]
    zap_lookup+0x12/0x20 [zfs]
    sa_setup+0x902/0x1380 [zfs]
    zfsvfs_init+0x3d6/0xb20 [zfs]
    zfsvfs_create+0x5dd/0x900 [zfs]
    zfs_domount+0xa3/0xe20 [zfs]

and thread 2 trying to get sa_lock, either in sa_setup:

    sa_setup+0x742/0x1380 [zfs]
    zfsvfs_init+0x3d6/0xb20 [zfs]
    zfsvfs_create+0x5dd/0x900 [zfs]
    zfs_domount+0xa3/0xe20 [zfs]

or in sa_build_index:

    sa_build_index+0x13d/0x790 [zfs]
    sa_handle_get_from_db+0x368/0x500 [zfs]
    zfs_znode_sa_init.isra.0+0x24b/0x330 [zfs]
    zfs_znode_alloc+0x3da/0x1a40 [zfs]
    zfs_zget+0x39a/0x6e0 [zfs]
    zfs_root+0x101/0x160 [zfs]
    zfs_domount+0x91f/0xea0 [zfs]

AFAICT, an sa_os_t is unique to its zfsvfs, so if we have two stacks
calling zfs_domount, each has a different zfsvfs and thus a different
sa, and there is no real deadlock here. The sa_setup vs sa_setup case
is easy, since each is referring to a newly allocated sa_os_t. In the
sa_build_index vs sa_setup case, we need to reason that the sa_os_t is
unique to its zfsvfs.

======================================================
WARNING: possible circular locking dependency detected
4.19.55-4.19.2-debug-b494b4b34cd8ef26 #1 Tainted: G W O
------------------------------------------------------
kswapd0/716 is trying to acquire lock:
00000000ac111d4a (&zfsvfs->z_teardown_inactive_lock){.+.+}, at: zfs_inactive+0x132/0xb40 [zfs]

but task is already holding lock:
00000000218b764d (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #4 (fs_reclaim){+.+.}:
       kmem_cache_alloc_node_trace+0x43/0x380
       __kmalloc_node+0x3c/0x60
       spl_kmem_alloc+0xd9/0x1f0 [spl]
       zap_name_alloc+0x34/0x480 [zfs]
       zap_lookup_impl+0x27/0x3a0 [zfs]
       zap_lookup_norm+0xb9/0x100 [zfs]
       zap_lookup+0x12/0x20 [zfs]
       dsl_dir_hold+0x341/0x660 [zfs]
       dsl_dataset_hold+0xb6/0x6c0 [zfs]
       dmu_objset_hold+0xca/0x120 [zfs]
       zpl_mount+0x90/0x3b0 [zfs]
       mount_fs+0x86/0x2b0
       vfs_kern_mount+0x68/0x3c0
       do_mount+0x306/0x2550
       ksys_mount+0x7e/0xd0
       __x64_sys_mount+0xba/0x150
       do_syscall_64+0x9b/0x410
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #3 (&zap->zap_rwlock){++++}:
       zap_lockdir_impl+0x7ed/0x15c0 [zfs]
       zap_lockdir+0xd2/0x100 [zfs]
       zap_lookup_norm+0x7f/0x100 [zfs]
       zap_lookup+0x12/0x20 [zfs]
       sa_setup+0x902/0x1380 [zfs]
       zfsvfs_init+0x3d6/0xb20 [zfs]
       zfsvfs_create+0x5dd/0x900 [zfs]
       zfs_domount+0xa3/0xe20 [zfs]
       zpl_mount+0x270/0x3b0 [zfs]
       mount_fs+0x86/0x2b0
       vfs_kern_mount+0x68/0x3c0
       do_mount+0x306/0x2550
       ksys_mount+0x7e/0xd0
       __x64_sys_mount+0xba/0x150
       do_syscall_64+0x9b/0x410
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #2 (&sa->sa_lock){+.+.}:
       sa_setup+0x742/0x1380 [zfs]
       zfsvfs_init+0x3d6/0xb20 [zfs]
       zfsvfs_create+0x5dd/0x900 [zfs]
       zfs_domount+0xa3/0xe20 [zfs]
       zpl_mount+0x270/0x3b0 [zfs]
       mount_fs+0x86/0x2b0
       vfs_kern_mount+0x68/0x3c0
       do_mount+0x306/0x2550
       ksys_mount+0x7e/0xd0
       __x64_sys_mount+0xba/0x150
       do_syscall_64+0x9b/0x410
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #1 (&os->os_user_ptr_lock){+.+.}:
       zfs_get_vfs_flag_unmounted+0x63/0x3c0 [zfs]
       dmu_free_long_range+0x963/0xda0 [zfs]
       zfs_rmnode+0x719/0x9c0 [zfs]
       zfs_inactive+0x306/0xb40 [zfs]
       zpl_evict_inode+0xa7/0x140 [zfs]
       evict+0x212/0x570
       do_unlinkat+0x2e6/0x540
       do_syscall_64+0x9b/0x410
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #0 (&zfsvfs->z_teardown_inactive_lock){.+.+}:
       down_read+0x3f/0xe0
       zfs_inactive+0x132/0xb40 [zfs]
       zpl_evict_inode+0xa7/0x140 [zfs]
       evict+0x212/0x570
       dispose_list+0xfa/0x1d0
       prune_icache_sb+0xd3/0x140
       super_cache_scan+0x292/0x440
       do_shrink_slab+0x2b9/0x800
       shrink_slab+0x195/0x410
       shrink_node+0x2e1/0x10f0
       kswapd+0x71c/0x11c0
       kthread+0x2e7/0x3e0
       ret_from_fork+0x3a/0x50

other info that might help us debug this:

Chain exists of:
  &zfsvfs->z_teardown_inactive_lock --> &zap->zap_rwlock --> fs_reclaim

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(fs_reclaim);
                               lock(&zap->zap_rwlock);
                               lock(fs_reclaim);
  lock(&zfsvfs->z_teardown_inactive_lock);

 *** DEADLOCK ***

3 locks held by kswapd0/716:
 #0: 00000000218b764d (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30
 #1: 00000000f9a6bfa1 (shrinker_rwsem){++++}, at: shrink_slab+0x109/0x410
 #2: 0000000076154958 (&type->s_umount_key#50){.+.+}, at: trylock_super+0x16/0xc0

stack backtrace:
CPU: 5 PID: 716 Comm: kswapd0 Tainted: G W O 4.19.55-4.19.2-debug-b494b4b34cd8ef26 #1
Hardware name: Dell Inc. PowerEdge R510/0W844P, BIOS 1.1.4 11/04/2009
Call Trace:
 dump_stack+0x91/0xeb
 print_circular_bug.isra.16+0x30b/0x5b0
 ? save_trace+0xd6/0x240
 __lock_acquire+0x41be/0x4f10
 ? debug_show_all_locks+0x2d0/0x2d0
 ? sched_clock_cpu+0x133/0x170
 ? lock_acquire+0x153/0x330
 lock_acquire+0x153/0x330
 ? zfs_inactive+0x132/0xb40 [zfs]
 down_read+0x3f/0xe0
 ? zfs_inactive+0x132/0xb40 [zfs]
 zfs_inactive+0x132/0xb40 [zfs]
 ? zfs_dirty_inode+0xa20/0xa20 [zfs]
 ? _raw_spin_unlock_irq+0x2d/0x40
 zpl_evict_inode+0xa7/0x140 [zfs]
 evict+0x212/0x570
 dispose_list+0xfa/0x1d0
 ? list_lru_walk_one+0x9c/0xd0
 prune_icache_sb+0xd3/0x140
 ? invalidate_inodes+0x370/0x370
 ? list_lru_count_one+0x179/0x310
 super_cache_scan+0x292/0x440
 do_shrink_slab+0x2b9/0x800
 shrink_slab+0x195/0x410
 ? unregister_shrinker+0x290/0x290
 shrink_node+0x2e1/0x10f0
 ? shrink_node_memcg+0x1230/0x1230
 ? zone_watermark_ok_safe+0x35/0x270
 ? lock_acquire+0x153/0x330
 ? __fs_reclaim_acquire+0x5/0x30
 ? pgdat_balanced+0x91/0xd0
 kswapd+0x71c/0x11c0
 ? mem_cgroup_shrink_node+0x460/0x460
 ? sched_clock_cpu+0x133/0x170
 ? _raw_spin_unlock_irq+0x29/0x40
 ? wait_woken+0x260/0x260
 ? check_flags.part.23+0x480/0x480
 ? __kthread_parkme+0xad/0x180
 ? mem_cgroup_shrink_node+0x460/0x460
 kthread+0x2e7/0x3e0
 ? kthread_park+0x120/0x120
 ret_from_fork+0x3a/0x50

======================================================
WARNING: possible circular locking dependency detected
4.19.55-4.19.2-debug-b494b4b34cd8ef26 #1 Tainted: G O
------------------------------------------------------
mount.zfs/3249 is trying to acquire lock:
000000000347bea0 (&zp->z_lock){+.+.}, at: zpl_mmap+0x27e/0x550 [zfs]

but task is already holding lock:
00000000224314a3 (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0x118/0x190

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #6 (&mm->mmap_sem){++++}:
       _copy_from_user+0x20/0xd0
       scsi_cmd_ioctl+0x47d/0x620
       cdrom_ioctl+0x10b/0x29b0
       sr_block_ioctl+0x107/0x150 [sr_mod]
       blkdev_ioctl+0x946/0x1600
       block_ioctl+0xdd/0x130
       do_vfs_ioctl+0x176/0xf70
       ksys_ioctl+0x66/0x70
       __x64_sys_ioctl+0x6f/0xb0
       do_syscall_64+0x9b/0x410
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #5 (sr_mutex){+.+.}:
       sr_block_open+0x104/0x1a0 [sr_mod]
       __blkdev_get+0x249/0x11c0
       blkdev_get+0x280/0x7a0
       do_dentry_open+0x7ee/0x1020
       path_openat+0x11a7/0x2500
       do_filp_open+0x17f/0x260
       do_sys_open+0x195/0x300
       __se_sys_open+0xbf/0xf0
       do_syscall_64+0x9b/0x410
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #4 (&bdev->bd_mutex){+.+.}:
       __blkdev_get+0x383/0x11c0
       blkdev_get+0x3bc/0x7a0
       blkdev_get_by_path+0x73/0xc0
       vdev_disk_open+0x4c8/0x12e0 [zfs]
       vdev_open+0x34c/0x13e0 [zfs]
       vdev_open_child+0x46/0xd0 [zfs]
       taskq_thread+0x979/0x1480 [spl]
       kthread+0x2e7/0x3e0
       ret_from_fork+0x3a/0x50

-> #3 (&vd->vd_lock){++++}:
       vdev_disk_io_start+0x13e/0x2230 [zfs]
       zio_vdev_io_start+0x358/0x990 [zfs]
       zio_nowait+0x1f4/0x3a0 [zfs]
       vdev_mirror_io_start+0x211/0x7b0 [zfs]
       zio_vdev_io_start+0x7d3/0x990 [zfs]
       zio_nowait+0x1f4/0x3a0 [zfs]
       arc_read+0x1782/0x43a0 [zfs]
       dbuf_read_impl.constprop.13+0xcb4/0x1fe0 [zfs]
       dbuf_read+0x2c8/0x12a0 [zfs]
       dmu_buf_hold_by_dnode+0x6d/0xd0 [zfs]
       zap_get_leaf_byblk.isra.6.part.7+0xd3/0x9d0 [zfs]
       zap_deref_leaf+0x1f3/0x290 [zfs]
       fzap_lookup+0x13c/0x340 [zfs]
       zap_lookup_impl+0x84/0x3a0 [zfs]
       zap_lookup_norm+0xb9/0x100 [zfs]
       zap_lookup+0x12/0x20 [zfs]
       spa_dir_prop+0x56/0xa0 [zfs]
       spa_ld_trusted_config+0xd0/0xe70 [zfs]
       spa_ld_mos_with_trusted_config+0x2b/0xb0 [zfs]
       spa_load+0x14d/0x27d0 [zfs]
       spa_tryimport+0x32e/0xa90 [zfs]
       zfs_ioc_pool_tryimport+0x107/0x190 [zfs]
       zfsdev_ioctl+0x1047/0x1370 [zfs]
       do_vfs_ioctl+0x176/0xf70
       ksys_ioctl+0x66/0x70
       __x64_sys_ioctl+0x6f/0xb0
       do_syscall_64+0x9b/0x410
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #2 (&zap->zap_rwlock){++++}:
       zap_lockdir_impl+0x858/0x16c0 [zfs]
       zap_lockdir+0xd2/0x100 [zfs]
       zap_lookup_norm+0x7f/0x100 [zfs]
       zap_lookup+0x12/0x20 [zfs]
       sa_setup+0x902/0x1380 [zfs]
       zfsvfs_init+0x6c8/0xc70 [zfs]
       zfsvfs_create_impl+0x5cf/0x970 [zfs]
       zfsvfs_create+0xc6/0x130 [zfs]
       zfs_domount+0x16f/0xea0 [zfs]
       zpl_mount+0x270/0x3b0 [zfs]
       mount_fs+0x86/0x2b0
       vfs_kern_mount+0x68/0x3c0
       do_mount+0x306/0x2550
       ksys_mount+0x7e/0xd0
       __x64_sys_mount+0xba/0x150
       do_syscall_64+0x9b/0x410
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #1 (&sa->sa_lock){+.+.}:
       sa_build_index+0x13d/0x790 [zfs]
       sa_handle_get_from_db+0x368/0x500 [zfs]
       zfs_znode_sa_init.isra.0+0x24b/0x330 [zfs]
       zfs_znode_alloc+0x3da/0x1a40 [zfs]
       zfs_zget+0x39a/0x6e0 [zfs]
       zfs_root+0x101/0x160 [zfs]
       zfs_domount+0x91f/0xea0 [zfs]
       zpl_mount+0x270/0x3b0 [zfs]
       mount_fs+0x86/0x2b0
       vfs_kern_mount+0x68/0x3c0
       do_mount+0x306/0x2550
       ksys_mount+0x7e/0xd0
       __x64_sys_mount+0xba/0x150
       do_syscall_64+0x9b/0x410
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #0 (&zp->z_lock){+.+.}:
       __mutex_lock+0xef/0x1380
       zpl_mmap+0x27e/0x550 [zfs]
       mmap_region+0x8fa/0x1150
       do_mmap+0x89a/0xd60
       vm_mmap_pgoff+0x14a/0x190
       ksys_mmap_pgoff+0x16b/0x490
       do_syscall_64+0x9b/0x410
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

other info that might help us debug this:

Chain exists of:
  &zp->z_lock --> sr_mutex --> &mm->mmap_sem

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&mm->mmap_sem);
                               lock(sr_mutex);
                               lock(&mm->mmap_sem);
  lock(&zp->z_lock);

 *** DEADLOCK ***

1 lock held by mount.zfs/3249:
 #0: 00000000224314a3 (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0x118/0x190

stack backtrace:
CPU: 3 PID: 3249 Comm: mount.zfs Tainted: G O 4.19.55-4.19.2-debug-b494b4b34cd8ef26 #1
Hardware name: Dell Inc. PowerEdge R510/0W844P, BIOS 1.1.4 11/04/2009
Call Trace:
 dump_stack+0x91/0xeb
 print_circular_bug.isra.16+0x30b/0x5b0
 ? save_trace+0xd6/0x240
 __lock_acquire+0x41be/0x4f10
 ? debug_show_all_locks+0x2d0/0x2d0
 ? sched_clock_cpu+0x18/0x170
 ? sched_clock_cpu+0x18/0x170
 ? __lock_acquire+0xe3b/0x4f10
 ? reacquire_held_locks+0x191/0x430
 ? reacquire_held_locks+0x191/0x430
 ? lock_acquire+0x153/0x330
 lock_acquire+0x153/0x330
 ? zpl_mmap+0x27e/0x550 [zfs]
 ? zpl_mmap+0x27e/0x550 [zfs]
 __mutex_lock+0xef/0x1380
 ? zpl_mmap+0x27e/0x550 [zfs]
 ? __mutex_add_waiter+0x160/0x160
 ? zpl_mmap+0x27e/0x550 [zfs]
 ? sched_clock+0x5/0x10
 ? sched_clock_cpu+0x18/0x170
 ? __mutex_add_waiter+0x160/0x160
 ? touch_atime+0xcd/0x230
 ? atime_needs_update+0x540/0x540
 ? do_raw_spin_unlock+0x54/0x250
 ? zpl_mmap+0x27e/0x550 [zfs]
 zpl_mmap+0x27e/0x550 [zfs]
 ? memset+0x1f/0x40
 mmap_region+0x8fa/0x1150
 ? arch_get_unmapped_area+0x460/0x460
 ? vm_brk+0x10/0x10
 ? lock_acquire+0x153/0x330
 ? lock_acquire+0x153/0x330
 ? security_mmap_addr+0x56/0x80
 ? get_unmapped_area+0x222/0x350
 do_mmap+0x89a/0xd60
 ? proc_keys_start+0x3d0/0x3d0
 vm_mmap_pgoff+0x14a/0x190
 ? vma_is_stack_for_current+0x90/0x90
 ? __ia32_sys_dup3+0xb0/0xb0
 ? vfs_statx_fd+0x49/0x80
 ? __se_sys_newfstat+0x75/0xa0
 ksys_mmap_pgoff+0x16b/0x490
 ? find_mergeable_anon_vma+0x90/0x90
 ? trace_hardirqs_on_thunk+0x1a/0x1c
 ? do_syscall_64+0x18/0x410
 do_syscall_64+0x9b/0x410
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Not-signed-off-by: Jeff Dike
---
 module/zfs/sa.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/module/zfs/sa.c b/module/zfs/sa.c
index 56a606962a7f..5d1e734653c9 100644
--- a/module/zfs/sa.c
+++ b/module/zfs/sa.c
@@ -231,6 +231,9 @@ sa_cache_destructor(void *buf, void *unused)
 	mutex_destroy(&hdl->sa_lock);
 }
 
+static kmutex_t sa_id_lock;
+static int sa_id = 0;
+
 void
 sa_cache_init(void)
 {
@@ -1017,8 +1020,12 @@ sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
 	mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL);
 	sa->sa_master_obj = sa_obj;
 
+	mutex_enter(&sa_id_lock);
+	sa_id++;
+	mutex_exit(&sa_id_lock);
+
 	os->os_sa = sa;
-	mutex_enter(&sa->sa_lock);
+	mutex_enter_nested(&sa->sa_lock, sa_id);
 	mutex_exit(&os->os_user_ptr_lock);
 	avl_create(&sa->sa_layout_num_tree, layout_num_compare,
 	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
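
On point 1 above: a minimal sketch of what the counter could look like if
the SPL/illumos atomic primitives are usable here, which would avoid both
sa_id_lock and the question of initializing it. The sa_next_id() helper is
hypothetical (not existing code) and assumes atomic_inc_32_nv() from
<sys/atomic.h> is available in sa.c:

    #include <sys/atomic.h>

    /* Would replace "static kmutex_t sa_id_lock" and "static int sa_id". */
    static uint32_t sa_id = 0;

    /*
     * Atomically bump the counter and return the new value; no separate
     * lock (and no mutex_init() for it) is needed.
     */
    static uint32_t
    sa_next_id(void)
    {
    	return (atomic_inc_32_nv(&sa_id));
    }

sa_setup() would then call mutex_enter_nested(&sa->sa_lock, sa_next_id())
in place of the mutex_enter()/sa_id++/mutex_exit() sequence in the hunk
above.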