From 26cb948007ff6ec2781aed17915cebc93768ce7f Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 5 Aug 2014 13:46:49 -0700 Subject: [PATCH] Avoid 128K kmem allocations in mzap_upgrade() As originally implemented the mzap_upgrade() function will perform allocations of up to SPA_MAXBLOCKSIZE bytes using kmem_alloc(). These large allocations can potentially block indefinitely if contiguous memory is not available. Since this allocation is done under the zap->zap_rwlock it can appear as if there is a deadlock in zap_lockdir(). This is shown below. The optimal fix for this would be to rework mzap_upgrade() such that no large allocations are required. This could be done but it would result in us diverging further from the other implementations. Therefore I've opted against doing this unless it becomes absolutely necessary. Instead mzap_upgrade() has been updated to use zio_buf_alloc() which can reliably provide buffers of up to SPA_MAXBLOCKSIZE. filebench R running task 0 15523 15521 0x0000008c Call Trace: [] fallback_alloc+0x1ba/0x270 [] cache_grow+0x2cf/0x320 [] ____cache_alloc_node+0x99/0x160 [] kmem_alloc_debug+0x251/0x490 [spl] [] __kmalloc+0x189/0x220 [] kmem_alloc_debug+0x251/0x490 [spl] [] mzap_upgrade+0xca/0x310 [zfs] [] zap_lockdir+0xab9/0xbb0 [zfs] [] zap_add+0x50/0x1c0 [zfs] [] zap_add_int+0x7a/0xa0 [zfs] [] zfs_unlinked_add+0x5f/0x110 [zfs] [] zfs_rmnode+0x1f1/0x410 [zfs] [] zfs_zinactive+0xfe/0x200 [zfs] [] zfs_inactive+0x7f/0x370 [zfs] [] zpl_inode_delete+0x0/0x30 [zfs] [] zpl_clear_inode+0xe/0x10 [zfs] [] clear_inode+0xac/0x140 [] zpl_inode_delete+0x20/0x30 [zfs] [] generic_delete_inode+0xde/0x1d0 [] generic_drop_inode+0x65/0x80 [] iput+0x62/0x70 [] do_unlinkat+0x1a9/0x260 [] sys_unlink+0x16/0x20 [] system_call_fastpath+0x16/0x1b kswapd0 D 0000000000000000 0 59 2 0x00000000 Call Trace: [] rwsem_down_failed_common+0x95/0x1d0 [] ? 
refcount_remove+0x16/0x20 [zfs] [] rwsem_down_write_failed+0x23/0x30 [] call_rwsem_down_write_failed+0x13/0x20 [] zap_lockdir+0x229/0xbb0 [zfs] [] zap_remove_norm+0x48/0x2d0 [zfs] [] zap_remove+0x13/0x20 [zfs] [] zap_remove_int+0x61/0x90 [zfs] [] zfs_rmnode+0x20c/0x410 [zfs] [] zfs_zinactive+0xfe/0x200 [zfs] [] zfs_inactive+0x7f/0x370 [zfs] [] zpl_clear_inode+0xe/0x10 [zfs] [] clear_inode+0xac/0x140 [] dispose_list+0x40/0x120 [] shrink_icache_memory+0x274/0x2e0 [] shrink_slab+0x12a/0x1a0 [] balance_pgdat+0x59a/0x820 [] kswapd+0x134/0x3b0 [] kthread+0x96/0xa0 Signed-off-by: Brian Behlendorf --- module/zfs/zap_micro.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index 68fb747697d2..2249b7338027 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -533,7 +533,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); sz = zap->zap_dbuf->db_size; - mzp = kmem_alloc(sz, KM_PUSHPAGE | KM_NODEBUG); + mzp = zio_buf_alloc(sz); bcopy(zap->zap_dbuf->db_data, mzp, sz); nchunks = zap->zap_m.zap_num_chunks; @@ -541,7 +541,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); if (err) { - kmem_free(mzp, sz); + zio_buf_free(mzp, sz); return (err); } } @@ -567,7 +567,7 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) if (err) break; } - kmem_free(mzp, sz); + zio_buf_free(mzp, sz); *zapp = zap; return (err); }