diff --git a/config/kernel-rename.m4 b/config/kernel-rename.m4 index f707391539d8..976074a264a7 100644 --- a/config/kernel-rename.m4 +++ b/config/kernel-rename.m4 @@ -4,7 +4,7 @@ dnl # iops->rename2() merged into iops->rename(), and iops->rename() now wants dnl # flags. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME_WANTS_FLAGS], [ - ZFS_LINUX_TEST_SRC([inode_operations_rename], [ + ZFS_LINUX_TEST_SRC([inode_operations_rename_flags], [ #include int rename_fn(struct inode *sip, struct dentry *sdp, struct inode *tip, struct dentry *tdp, @@ -19,7 +19,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME_WANTS_FLAGS], [ AC_DEFUN([ZFS_AC_KERNEL_RENAME_WANTS_FLAGS], [ AC_MSG_CHECKING([whether iops->rename() wants flags]) - ZFS_LINUX_TEST_RESULT([inode_operations_rename], [ + ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1, [iops->rename() wants flags]) @@ -27,3 +27,31 @@ AC_DEFUN([ZFS_AC_KERNEL_RENAME_WANTS_FLAGS], [ AC_MSG_RESULT(no) ]) ]) + +dnl # +dnl # iops->rename2() is supported since Linux 3.5 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_HAS_RENAME2], [ + ZFS_LINUX_TEST_SRC([inode_operations_rename], [ + #include + int rename2_fn(struct inode *sip, struct dentry *sdp, + struct inode *tip, struct dentry *tdp, + unsigned int flags) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .rename2 = rename2_fn, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_HAS_RENAME2], [ + AC_MSG_CHECKING([whether iops->rename() wants flags]) + ZFS_LINUX_TEST_RESULT([inode_operations_rename], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RENAME2, 1, + [iops->rename2() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index b22a00cdd130..9a5bc7e7ff24 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -133,6 +133,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_KUIDGID_T ZFS_AC_KERNEL_SRC_KUID_HELPERS ZFS_AC_KERNEL_SRC_MODULE_PARAM_CALL_CONST + ZFS_AC_KERNEL_SRC_HAS_RENAME2 ZFS_AC_KERNEL_SRC_RENAME_WANTS_FLAGS ZFS_AC_KERNEL_SRC_CURRENT_TIME ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES @@ -250,6 +251,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_KUIDGID_T ZFS_AC_KERNEL_KUID_HELPERS ZFS_AC_KERNEL_MODULE_PARAM_CALL_CONST + ZFS_AC_KERNEL_HAS_RENAME2 ZFS_AC_KERNEL_RENAME_WANTS_FLAGS ZFS_AC_KERNEL_CURRENT_TIME ZFS_AC_KERNEL_USERNS_CAPABILITIES diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 734499341fd1..a78a7d23da46 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -402,6 +402,13 @@ extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name); extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *dzp, znode_t *zp, char *name, char *link); +extern void zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, + uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, + char *dname, znode_t *szp); +extern void zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, + uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, + char *dname, znode_t *szp, znode_t *wzp, vsecattr_t *vsecp, + zfs_fuid_info_t *fuidp, vattr_t *vap); extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, diff --git a/include/sys/zil.h b/include/sys/zil.h index fed9dfe3a4e9..e84fb864abc3 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -162,9 +162,7 @@ typedef enum zil_create { #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ #define TX_WRITE2 20 /* dmu_sync EALREADY write */ -#define TX_EXCHANGE 21 /* Exchange two paths */ -#define TX_WHITEOUT 22 /* Rename a file, leaving a whiteout */ -#define TX_MAX_TYPE 23 /* Max transaction type */ +#define TX_MAX_TYPE 21 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c index 9aa93a53e3f4..ad410befc04a 100644 --- a/module/os/linux/zfs/zfs_dir.c +++ b/module/os/linux/zfs/zfs_dir.c @@ -1015,7 +1015,8 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag, } /* The only error is !zfs_dirempty() and we checked earlier. */ - ASSERT3U(zfs_drop_nlink_locked(zp, tx, &unlinked), ==, 0); + error = zfs_drop_nlink_locked(zp, tx, &unlinked); + ASSERT3U(error, ==, 0); mutex_exit(&zp->z_lock); } else { error = zfs_dropname(dl, zp, dzp, tx, flag); diff --git a/module/os/linux/zfs/zfs_vnops.c b/module/os/linux/zfs/zfs_vnops.c index 272782206bd7..2a08d349d961 100644 --- a/module/os/linux/zfs/zfs_vnops.c +++ b/module/os/linux/zfs/zfs_vnops.c @@ -3671,24 +3671,18 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, int error = 0; int zflg = 0; boolean_t waited = B_FALSE; - uint64_t txtype; /* Needed for whiteout inode creation. */ vattr_t wo_vap; uint64_t wo_projid; boolean_t fuid_dirtied; zfs_acl_ids_t acl_ids; boolean_t have_acl = B_FALSE; + znode_t *wzp = NULL; + if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); - if (flags & RENAME_EXCHANGE) - txtype = TX_EXCHANGE; - else if (flags & RENAME_WHITEOUT) - txtype = TX_WHITEOUT; - else - txtype = TX_RENAME; - ZFS_ENTER(zfsvfs); zilog = zfsvfs->z_log; @@ -3887,7 +3881,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, /* * Source and target must be the same type (unless exchanging). */ - if (txtype != TX_EXCHANGE) { + if (!(flags & RENAME_EXCHANGE)) { boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; @@ -3905,15 +3899,14 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, error = 0; goto out; } - } - /* Target must exist for RENAME_EXCHANGE. */ - if (!tzp && txtype == TX_EXCHANGE) { + } else if (flags & RENAME_EXCHANGE) { + /* Target must exist for RENAME_EXCHANGE. */ error = SET_ERROR(ENOENT); goto out; } /* Set up inode creation for RENAME_WHITEOUT. */ - if (txtype == TX_WHITEOUT) { + if (flags & RENAME_WHITEOUT) { error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr); if (error) goto out; @@ -3924,6 +3917,8 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, error = zfs_acl_ids_create(sdzp, 0, &wo_vap, cr, NULL, &acl_ids); + /* zpl_vap_init increases reference on cr */ + crfree(cr); if (error) goto out; have_acl = B_TRUE; @@ -3937,7 +3932,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); - dmu_tx_hold_zap(tx, sdzp->z_id, txtype == TX_EXCHANGE, snm); + dmu_tx_hold_zap(tx, sdzp->z_id, !!(flags & RENAME_EXCHANGE), snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); @@ -3947,7 +3942,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } - if (txtype == TX_WHITEOUT) { + if (flags & RENAME_WHITEOUT) { dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); @@ -4012,7 +4007,7 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, if (tzp) { int tzflg = zflg; - if (txtype == TX_EXCHANGE) { + if (flags & RENAME_EXCHANGE) { /* This inode will be re-linked soon. */ tzflg |= ZRENAMING; @@ -4046,38 +4041,49 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, goto commit_link_tzp; } - switch (txtype) { - case TX_EXCHANGE: - error = zfs_link_create(sdl, tzp, tx, ZRENAMING); - /* - * The same argument as zfs_link_create() failing for - * szp applies here, since the source directory must - * have had an entry we are replacing. - */ - ASSERT3U(error, ==, 0); - if (error) - goto commit_unlink_td_szp; - break; - case TX_WHITEOUT: { - znode_t *wzp; - - zfs_mknode(sdzp, &wo_vap, tx, cr, 0, &wzp, &acl_ids); - error = zfs_link_create(sdl, wzp, tx, ZNEW); - if (error) { - zfs_znode_delete(wzp, tx); - remove_inode_hash(ZTOI(wzp)); - goto commit_unlink_td_szp; - } - /* No need to zfs_log_create_txtype here. */ + if (flags & RENAME_EXCHANGE) { + error = zfs_link_create(sdl, tzp, tx, ZRENAMING); + /* + * The same argument as zfs_link_create() failing for + * szp applies here, since the source directory must + * have had an entry we are replacing. + */ + ASSERT3U(error, ==, 0); + if (error) + goto commit_unlink_td_szp; + } else if (flags & RENAME_WHITEOUT) { + zfs_mknode(sdzp, &wo_vap, tx, cr, 0, &wzp, &acl_ids); + error = zfs_link_create(sdl, wzp, tx, ZNEW); + if (error) { + zfs_znode_delete(wzp, tx); + remove_inode_hash(ZTOI(wzp)); + goto commit_unlink_td_szp; } + /* No need to zfs_log_create_txtype here. */ } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); - zfs_log_rename(zilog, tx, txtype | - (flags & FIGNORECASE ? TX_CI : 0), sdzp, - sdl->dl_name, tdzp, tdl->dl_name, szp); + if (flags & RENAME_EXCHANGE) { + zfs_log_rename_exchange(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + } else if (flags & RENAME_WHITEOUT) { + vsecattr_t vsecp; + + vsecp.vsa_mask |= VSA_ACE_ALLTYPES; + error = zfs_getacl(szp, &vsecp, B_TRUE, cr); + + zfs_log_rename_whiteout(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp, wzp, + &vsecp, acl_ids.z_fuidp, &wo_vap); + } else { + zfs_log_rename(zilog, tx, + (flags & FIGNORECASE ? TX_CI : 0), sdzp, + sdl->dl_name, tdzp, tdl->dl_name, szp); + } commit: dmu_tx_commit(tx); @@ -4085,12 +4091,6 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, if (have_acl) zfs_acl_ids_free(&acl_ids); - if (zl != NULL) - zfs_rename_unlock(&zl); - - zfs_dirent_unlock(sdl); - zfs_dirent_unlock(tdl); - zfs_inode_update(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); @@ -4100,11 +4100,21 @@ zfs_rename(struct inode *sdip, char *snm, struct inode *tdip, char *tnm, zfs_inode_update(szp); iput(ZTOI(szp)); + if (wzp) { + zfs_inode_update(wzp); + iput(ZTOI(wzp)); + } if (tzp) { zfs_inode_update(tzp); iput(ZTOI(tzp)); } + if (zl != NULL) + zfs_rename_unlock(&zl); + + zfs_dirent_unlock(sdl); + zfs_dirent_unlock(tdl); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index a88cd5df2f3e..37d0f8394c96 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -421,7 +421,7 @@ zpl_rename2(struct inode *sdip, struct dentry *sdentry, return (error); } -#ifndef HAVE_RENAME_WANTS_FLAGS +#if defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_RENAME2) static int zpl_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -690,11 +690,15 @@ const struct inode_operations zpl_dir_inode_operations = { .mkdir = zpl_mkdir, .rmdir = zpl_rmdir, .mknod = zpl_mknod, +#ifdef HAVE_RENAME2 + .rename2 = zpl_rename2, +#else /* not HAVE_RENAME2, still might accept flags */ #ifdef HAVE_RENAME_WANTS_FLAGS .rename = zpl_rename2, -#else +#else /* not HAVE_RENAME_WANTS_FLAGS, doesn't accept flags at all */ .rename = zpl_rename, -#endif +#endif /* HAVE_RENAME_WANTS_FLAGS */ +#endif /* HAVE_RENAME2 */ #ifdef HAVE_TMPFILE .tmpfile = zpl_tmpfile, #endif diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index 328faf30d5df..0580e8553cff 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -474,9 +474,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, } /* - * Handles TX_{RENAME,EXCHANGE,WHITEOUT} transactions. They all have the same - * underyling structure (lr_rename_t) but have different txtypes to indicate - * different renameat2(2) flags. + * Handles TX_RENAME transactions. */ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, @@ -490,6 +488,7 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, if (zil_replaying(zilog, tx)) return; + txtype |= TX_RENAME; itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); lr = (lr_rename_t *)&itx->itx_lr; lr->lr_sdoid = sdzp->z_id; @@ -501,6 +500,94 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, zil_itx_assign(zilog, itx, tx); } +/* + * At the moment, only Linux supports renameat2 variant of renameat, which + * adds three new flags of interest for us: + * RENAME_NOREPLACE: if the target name at the moment of the call exists, + * don't rewrite it and return error + * RENAME_EXCHANGE: atomically swap the two names on the filesystem + * RENAME_WHITEOUT: creates a whiteout inode in place of renamed file as + * an atomic operation + * + * Ideally, these operations should be represented as new ZFS Intent Log record + * types, which should mandate a new ZFS feature flag due to the on-disk format + * change. One would use spa_feature_incr/decr functions to indicate that we're + * actually actively using the new on-disk txtypes - but these functions are + * only supposed to be called from the txg syncing context. + * This means that would need to force out in-progress txg to disk and start + * a new one before writing any ZIL records, just so we can be sure that ZIL + * replaying ZFS gets told it should expect potentially incompatible ZIL + * txtypes. Doing this would hurt performance. + * Alternatively, we could just activate the feature on a pool when these + * renameat2 flags get first used and leave it at that - which would render the + * pool importable read-only on implementations without the new feature flag, + * even when no new txtypes IL records would be present on-disk - which on most + * setups could be 'almost' all the time, so it'd be a shame to have them all + * read-only on non-Linux platforms. + * As a third option, at least until more platforms implement renameat2, we + * choose to rely on the fact that the ZIL is replayed in single-threaded mode + * before the dataset is mounted. This way, we can represent the otherwise + * atomic operations as a series of plain good old txtypes known to all current + * OpenZFS implementations. To do that, we use these hacky functions: + * + * zfs_log_rename_exchange + * zfs_log_rename_whiteout + * + * To represent atomic rename with old non-atomic operations, we need + * a temporary new name; so we try picking a name until we succeed, then + * we get a dirent lock for that temp name until the final itx gets queued + */ + +void +zfs_log_rename_exchange(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) +{ + zfs_dirlock_t *tmpdl; + znode_t *tmpzp = NULL; + char rndname[16]; + char *tmpname; + int retries = 0; + int error; + + tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + ASSERT3P(tmpname, !=, NULL); + + for (int i = 0; i < 16; i++) { + int r = 0xFF; + while (r > 127 || r == 0 || r == '/') + random_get_pseudo_bytes((void *)&r, 1); + rndname[i] = (char)r; + } + + do { + retries++; + (void) snprintf(tmpname, MAXPATHLEN, + "%s.zfs_renameat2_emul_%s%4d", + dname, rndname, retries); + error = zfs_dirent_lock(&tmpdl, tdzp, tmpname, + &tmpzp, ZNEW, NULL, NULL); + } while (error != 0 && retries != INT_MAX); + ASSERT3U(retries, !=, INT_MAX); + + zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, tmpname, szp); + zfs_log_rename(zilog, tx, txtype, tdzp, dname, sdzp, sname, szp); + zfs_log_rename(zilog, tx, txtype, tdzp, tmpname, tdzp, dname, szp); + + zfs_dirent_unlock(tmpdl); + kmem_free(tmpname, MAXPATHLEN); +} + +/* See comment above zfs_log_rename_exchange */ +void +zfs_log_rename_whiteout(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp, + znode_t *wzp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp, vattr_t *vap) +{ + zfs_log_rename(zilog, tx, txtype, sdzp, sname, tdzp, dname, szp); + txtype |= TX_CREATE; + zfs_log_create(zilog, tx, txtype, sdzp, wzp, sname, vsecp, fuidp, vap); +} + /* * zfs_log_write() handles TX_WRITE transactions. The specified callback is * called as soon as the write is on stable storage (be it via a DMU sync or a diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index fd31417c102e..7dea85bb6614 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -625,7 +625,7 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) } static int -_zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg) +zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; @@ -633,6 +633,7 @@ _zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg) char *tname = sname + strlen(sname) + 1; znode_t *sdzp, *tdzp; int error; + int vflg = 0; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -656,24 +657,6 @@ _zfs_replay_renameat2(void *arg1, void *arg2, boolean_t byteswap, int vflg) return (error); } -static int -zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) -{ - return (_zfs_replay_renameat2(arg1, arg2, byteswap, 0)); -} - -static int -zfs_replay_exchange(void *arg1, void *arg2, boolean_t byteswap) -{ - return (_zfs_replay_renameat2(arg1, arg2, byteswap, RENAME_EXCHANGE)); -} - -static int -zfs_replay_whiteout(void *arg1, void *arg2, boolean_t byteswap) -{ - return (_zfs_replay_renameat2(arg1, arg2, byteswap, RENAME_WHITEOUT)); -} - static int zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) { @@ -998,6 +981,4 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ - zfs_replay_exchange, /* TX_EXCHANGE */ - zfs_replay_whiteout, /* TX_WHITEOUT */ };