From cc63068e95ee725cce03b1b7ce50179825a6cda5 Mon Sep 17 00:00:00 2001 From: sanjeevbagewadi Date: Fri, 9 Feb 2018 23:45:53 +0530 Subject: [PATCH] Handle zap_add() failures in mixed case mode With "casesensitivity=mixed", zap_add() could fail when the number of files/directories with the same name (varying in case) exceed the capacity of the leaf node of a Fatzap. This results in a ASSERT() failure as zfs_link_create() does not expect zap_add() to fail. The fix is to handle these failures and rollback the transactions. Reviewed by: Matt Ahrens Reviewed-by: Chunwei Chen Reviewed-by: Brian Behlendorf Signed-off-by: Sanjeev Bagewadi Closes #7011 Closes #7054 --- include/sys/zap_leaf.h | 15 +- module/zfs/zap.c | 25 +++- module/zfs/zap_leaf.c | 2 +- module/zfs/zap_micro.c | 38 ++++- module/zfs/zfs_dir.c | 29 +++- module/zfs/zfs_vnops.c | 74 +++++++--- tests/runfiles/linux.run | 2 +- .../tests/functional/casenorm/Makefile.am | 1 + .../casenorm/mixed_create_failure.ksh | 136 ++++++++++++++++++ 9 files changed, 290 insertions(+), 32 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h index e784c5963b2e..a3da1036a5ee 100644 --- a/include/sys/zap_leaf.h +++ b/include/sys/zap_leaf.h @@ -46,10 +46,15 @@ struct zap_stats; * block size (1<l_bs) - hash entry size (2) * number of hash * entries - header space (2*chunksize) */ -#define ZAP_LEAF_NUMCHUNKS(l) \ - (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ +#define ZAP_LEAF_NUMCHUNKS_BS(bs) \ + (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ ZAP_LEAF_CHUNKSIZE - 2) +#define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs))) + +#define ZAP_LEAF_NUMCHUNKS_DEF \ + (ZAP_LEAF_NUMCHUNKS_BS(fzap_default_block_shift)) + /* * The amount of space within the chunk available for the array is: * chunk size - space for type (1) - space for next pointer (2) @@ -74,8 +79,10 @@ struct zap_stats; * which is less than block size / CHUNKSIZE (24) / minimum number of * chunks per entry (3). */ -#define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) -#define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) +#define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5) +#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs)) +#define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs))) +#define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs))) /* * The chunks start immediately after the hash table. The end of the diff --git a/module/zfs/zap.c b/module/zfs/zap.c index 2f6aed667363..1c2030bda303 100644 --- a/module/zfs/zap.c +++ b/module/zfs/zap.c @@ -818,15 +818,19 @@ fzap_lookup(zap_name_t *zn, return (err); } +#define MAX_EXPAND_RETRIES 2 + int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) { zap_leaf_t *l; + zap_leaf_t *prev_l = NULL; int err; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; + int expand_retries = 0; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); @@ -850,10 +854,29 @@ fzap_add_cd(zap_name_t *zn, if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { + /* + * If the last two expansions did not help, there is no point + * trying to expand again + */ + if (expand_retries > MAX_EXPAND_RETRIES && prev_l == l) { + err = SET_ERROR(ENOSPC); + goto out; + } + err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ - if (err == 0) + if (err == 0) { + prev_l = l; + expand_retries++; goto retry; + } else if (err == ENOSPC) { + /* + * If we failed to expand the leaf, then bailout + * as there is no point trying + * zap_put_leaf_maybe_grow_ptrtbl(). + */ + return (err); + } } out: diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c index 5341fc098b96..661fd747b150 100644 --- a/module/zfs/zap_leaf.c +++ b/module/zfs/zap_leaf.c @@ -53,7 +53,7 @@ static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); ((h) >> \ (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) -#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) +#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)]) extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l); diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index b81a48a0f63d..ad9a62161ed5 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -363,6 +363,41 @@ mze_find_unused_cd(zap_t *zap, uint64_t hash) return (cd); } +/* + * Each mzap entry requires at max : 4 chunks + * 3 chunks for names + 1 chunk for value. + */ +#define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \ + ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t))) + +/* + * Check if the current entry keeps the colliding entries under the fatzap leaf + * size. + */ +static boolean_t +mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) +{ + zap_t *zap = zn->zn_zap; + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + avl_index_t idx; + avl_tree_t *avl = &zap->zap_m.zap_avl; + uint32_t mzap_ents = 0; + + mze_tofind.mze_hash = hash; + mze_tofind.mze_cd = 0; + + for (mze = avl_find(avl, &mze_tofind, &idx); + mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + mzap_ents++; + } + + /* Include the new entry being added */ + mzap_ents++; + + return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS)); +} + static void mze_remove(zap_t *zap, mzap_ent_t *mze) { @@ -1190,7 +1225,8 @@ zap_add_impl(zap_t *zap, const char *key, err = fzap_add(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || - strlen(key) >= MZAP_NAME_LEN) { + strlen(key) >= MZAP_NAME_LEN || + !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); if (err == 0) { err = fzap_add(zn, integer_size, num_integers, val, diff --git a/module/zfs/zfs_dir.c b/module/zfs/zfs_dir.c index 9a8bbccd92d7..6398a1d155e2 100644 --- a/module/zfs/zfs_dir.c +++ b/module/zfs/zfs_dir.c @@ -742,7 +742,11 @@ zfs_dirent(znode_t *zp, uint64_t mode) } /* - * Link zp into dl. Can only fail if zp has been unlinked. + * Link zp into dl. Can fail in the following cases : + * - if zp has been unlinked. + * - if the number of entries with the same hash (aka. colliding entries) + * exceed the capacity of a leaf-block of fatzap and splitting of the + * leaf-block does not help. */ int zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) @@ -776,6 +780,24 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) NULL, &links, sizeof (links)); } } + + value = zfs_dirent(zp, zp->z_mode); + error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, + &value, tx); + + /* + * zap_add could fail to add the entry if it exceeds the capacity of the + * leaf-block and zap_leaf_split() failed to help. + * The caller of this routine is responsible for failing the transaction + * which will rollback the SA updates done above. + */ + if (error != 0) { + if (!(flag & ZRENAMING) && !(flag & ZNEW)) + drop_nlink(ZTOI(zp)); + mutex_exit(&zp->z_lock); + return (error); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &dzp->z_id, sizeof (dzp->z_id)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, @@ -813,11 +835,6 @@ zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) ASSERT(error == 0); mutex_exit(&dzp->z_lock); - value = zfs_dirent(zp, zp->z_mode); - error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, - 8, 1, &value, tx); - ASSERT(error == 0); - return (0); } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 5e34cb2fff24..e21fe1acae2a 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1429,6 +1429,7 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { @@ -1446,10 +1447,22 @@ zfs_create(struct inode *dip, char *name, vattr_t *vap, int excl, } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + /* + * Since, we failed to add the directory entry for it, + * delete the newly created dnode. + */ + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + goto out; + } + if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - (void) zfs_link_create(dl, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; @@ -2040,13 +2053,18 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); - if (fuid_dirtied) - zfs_fuid_sync(zfsvfs, tx); - /* * Now put new name in parent dir. */ - (void) zfs_link_create(dl, zp, tx, ZNEW); + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); *ipp = ZTOI(zp); @@ -2056,6 +2074,7 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); +out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); @@ -2065,10 +2084,14 @@ zfs_mkdir(struct inode *dip, char *dirname, vattr_t *vap, struct inode **ipp, if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); - zfs_inode_update(dzp); - zfs_inode_update(zp); + if (error != 0) { + iput(ZTOI(zp)); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + } ZFS_EXIT(zfsvfs); - return (0); + return (error); } /* @@ -3686,6 +3709,13 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, VERIFY3U(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL), ==, 0); } + } else { + /* + * If we had removed the existing target, subsequent + * call to zfs_link_create() to add back the same entry + * but, the new dnode (szp) should not fail. + */ + ASSERT(tzp == NULL); } } @@ -3856,14 +3886,18 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, /* * Insert the new object into the directory. */ - (void) zfs_link_create(dl, zp, tx, ZNEW); - - if (flags & FIGNORECASE) - txtype |= TX_CI; - zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + } else { + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); - zfs_inode_update(dzp); - zfs_inode_update(zp); + zfs_inode_update(dzp); + zfs_inode_update(zp); + } zfs_acl_ids_free(&acl_ids); @@ -3871,10 +3905,14 @@ zfs_symlink(struct inode *dip, char *name, vattr_t *vap, char *link, zfs_dirent_unlock(dl); - *ipp = ZTOI(zp); + if (error == 0) { + *ipp = ZTOI(zp); - if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) - zil_commit(zilog, 0); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + } else { + iput(ZTOI(zp)); + } ZFS_EXIT(zfsvfs); return (error); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 6093275a1b4e..43dd5cb1fb84 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -59,7 +59,7 @@ tags = ['functional', 'cachefile'] # 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', # 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] [tests/functional/casenorm] -tests = ['case_all_values', 'norm_all_values'] +tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure'] tags = ['functional', 'casenorm'] [tests/functional/channel_program/lua_core] diff --git a/tests/zfs-tests/tests/functional/casenorm/Makefile.am b/tests/zfs-tests/tests/functional/casenorm/Makefile.am index 00a19c7ff77d..00cb59074ed2 100644 --- a/tests/zfs-tests/tests/functional/casenorm/Makefile.am +++ b/tests/zfs-tests/tests/functional/casenorm/Makefile.am @@ -9,6 +9,7 @@ dist_pkgdata_SCRIPTS = \ insensitive_formd_lookup.ksh \ insensitive_none_delete.ksh \ insensitive_none_lookup.ksh \ + mixed_create_failure.ksh \ mixed_formd_delete.ksh \ mixed_formd_lookup_ci.ksh \ mixed_formd_lookup.ksh \ diff --git a/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh b/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh new file mode 100755 index 000000000000..51b5bb3f6584 --- /dev/null +++ b/tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh @@ -0,0 +1,136 @@ +#!/bin/ksh -p +# +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# +# Copyright 2018 Nutanix Inc. All rights reserved. +# + +. $STF_SUITE/tests/functional/casenorm/casenorm.kshlib + +# DESCRIPTION: +# For the filesystem with casesensitivity=mixed, normalization=none, +# when multiple files with the same name (differing only in case) are created, +# the number of files is limited to what can fit in a fatzap leaf-block. +# And beyond that, it fails with ENOSPC. +# +# Ensure that the create/rename operations fail gracefully and not trigger an +# ASSERT. +# +# STRATEGY: +# Repeat the below steps for objects: files, directories, symlinks and hardlinks +# 1. Create objects with same name but varying in case. +# E.g. 'abcdefghijklmnop', 'Abcdefghijklmnop', 'ABcdefghijklmnop' etc. +# The create should fail with ENOSPC. +# 2. Create an object with name 'tmp_obj' and try to rename it to name that we +# failed to add in step 1 above. +# This should fail as well. + +verify_runnable "global" + +function cleanup +{ + destroy_testfs +} + +log_onexit cleanup +log_assert "With mixed mode: ensure create fails with ENOSPC beyond a certain limit" + +create_testfs "-o casesensitivity=mixed -o normalization=none" + +# Different object types +obj_type=('file' 'dir' 'symlink' 'hardlink') + +# Commands to create different object types +typeset -A ops +ops['file']='touch' +ops['dir']='mkdir' +ops['symlink']='ln -s' +ops['hardlink']='ln' + +# This function tests the following for a give object type : +# - Create multiple objects with the same name (varying only in case). +# Ensure that it eventually fails once the leaf-block limit is exceeded. +# - Create another object with a different name. And attempt rename it to the +# name (for which the create had failed in the previous step). +# This should fail as well. +# Args : +# $1 - object type (file/dir/symlink/hardlink) +# $2 - test directory +# +function test_ops +{ + typeset obj_type=$1 + typeset testdir=$2 + + target_obj='target-file' + + op="${ops[$obj_type]}" + + log_note "The op : $op" + log_note "testdir=$testdir obj_type=$obj_type" + + test_path="$testdir/$obj_type" + mkdir $test_path + log_note "Created test dir $test_path" + + if [[ $obj_type = "symlink" || $obj_type = "hardlink" ]]; then + touch $test_path/$target_obj + log_note "Created target: $test_path/$target_obj" + op="$op $test_path/$target_obj" + fi + + log_note "op : $op" + names='{a,A}{b,B}{c,C}{d,D}{e,E}{f,F}{g,G}{h,H}{i,I}{j,J}{k,K}{l,L}' + for name in $names; do + cmd="$op $test_path/$name" + out=$($cmd 2>&1) + ret=$? + log_note "cmd: $cmd ret: $ret out=$out" + if (($ret != 0)); then + if [[ $out = *@(No space left on device)* ]]; then + save_name="$test_path/$name" + break; + else + log_err "$cmd failed with unexpected error : $out" + fi + fi + done + + log_note 'Test rename \"sample_name\" rename' + TMP_OBJ="$test_path/tmp_obj" + cmd="$op $TMP_OBJ" + out=$($cmd 2>&1) + ret=$? + if (($ret != 0)); then + log_err "cmd:$cmd failed out:$out" + fi + + # Now, try to rename the tmp_obj to the name which we failed to add earlier. + # This should fail as well. + out=$(mv $TMP_OBJ $save_name 2>&1) + ret=$? + if (($ret != 0)); then + if [[ $out = *@(No space left on device)* ]]; then + log_note "$cmd failed as expected : $out" + else + log_err "$cmd failed with : $out" + fi + fi +} + +for obj_type in ${obj_type[*]}; +do + log_note "Testing create of $obj_type" + test_ops $obj_type $TESTDIR +done + +log_pass "Mixed mode FS: Ops on large number of colliding names fail gracefully"