diff --git a/include/sys/vdev_draid.h b/include/sys/vdev_draid.h index 52ce4ba16105..dd334acbacf1 100644 --- a/include/sys/vdev_draid.h +++ b/include/sys/vdev_draid.h @@ -96,6 +96,7 @@ extern boolean_t vdev_draid_readable(vdev_t *, uint64_t); extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t); extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t); extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *); +extern int vdev_draid_map_verify_empty(zio_t *, struct raidz_row *); extern nvlist_t *vdev_draid_read_config_spare(vdev_t *); /* Functions for dRAID distributed spares. */ diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h index ee597eb0dbb3..c7cf0af6d945 100644 --- a/include/sys/vdev_raidz.h +++ b/include/sys/vdev_raidz.h @@ -32,6 +32,7 @@ extern "C" { #endif struct zio; +struct raidz_col; struct raidz_row; struct raidz_map; #if !defined(_KERNEL) @@ -49,6 +50,7 @@ void vdev_raidz_generate_parity(struct raidz_map *); void vdev_raidz_reconstruct(struct raidz_map *, const int *, int); void vdev_raidz_child_done(zio_t *); void vdev_raidz_io_done(zio_t *); +void vdev_raidz_checksum_error(zio_t *, struct raidz_col *, abd_t *); extern const zio_vsd_ops_t vdev_raidz_vsd_ops; diff --git a/lib/libshare/os/freebsd/nfs.c b/lib/libshare/os/freebsd/nfs.c index 56df3e66643c..97092bdc0fdf 100644 --- a/lib/libshare/os/freebsd/nfs.c +++ b/lib/libshare/os/freebsd/nfs.c @@ -423,9 +423,10 @@ nfs_commit_shares(void) struct pidfh *pfh; pid_t mountdpid; +start: pfh = pidfile_open(_PATH_MOUNTDPID, 0600, &mountdpid); if (pfh != NULL) { - /* Mountd is not running. */ + /* mountd(8) is not running. */ pidfile_remove(pfh); return (SA_OK); } @@ -433,6 +434,11 @@ nfs_commit_shares(void) /* Cannot open pidfile for some reason. */ return (SA_SYSTEM_ERR); } + if (mountdpid == -1) { + /* mountd(8) exists, but didn't write the PID yet */ + usleep(500); + goto start; + } /* We have mountd(8) PID in mountdpid variable. 
*/ kill(mountdpid, SIGHUP); return (SA_OK); diff --git a/lib/libspl/include/sys/feature_tests.h b/lib/libspl/include/sys/feature_tests.h index a36fd7b8cffb..c9564b2c3269 100644 --- a/lib/libspl/include/sys/feature_tests.h +++ b/lib/libspl/include/sys/feature_tests.h @@ -30,7 +30,7 @@ #define ____cacheline_aligned #define __NORETURN __attribute__((__noreturn__)) -#if !defined(fallthrough) +#if !defined(fallthrough) && !defined(_LIBCPP_VERSION) #if defined(HAVE_IMPLICIT_FALLTHROUGH) #define fallthrough __attribute__((__fallthrough__)) #else diff --git a/lib/libspl/os/freebsd/mnttab.c b/lib/libspl/os/freebsd/mnttab.c index bd3e3e4e3eef..d830257fbd16 100644 --- a/lib/libspl/os/freebsd/mnttab.c +++ b/lib/libspl/os/freebsd/mnttab.c @@ -91,16 +91,28 @@ optadd(char *mntopts, size_t size, const char *opt) strlcat(mntopts, opt, size); } +static __thread char gfstypename[MFSNAMELEN]; +static __thread char gmntfromname[MNAMELEN]; +static __thread char gmntonname[MNAMELEN]; +static __thread char gmntopts[MNTMAXSTR]; + void statfs2mnttab(struct statfs *sfs, struct mnttab *mp) { - static char mntopts[MNTMAXSTR]; long flags; - mntopts[0] = '\0'; + strlcpy(gfstypename, sfs->f_fstypename, sizeof (gfstypename)); + mp->mnt_fstype = gfstypename; + + strlcpy(gmntfromname, sfs->f_mntfromname, sizeof (gmntfromname)); + mp->mnt_special = gmntfromname; + + strlcpy(gmntonname, sfs->f_mntonname, sizeof (gmntonname)); + mp->mnt_mountp = gmntonname; flags = sfs->f_flags; -#define OPTADD(opt) optadd(mntopts, sizeof (mntopts), (opt)) + gmntopts[0] = '\0'; +#define OPTADD(opt) optadd(gmntopts, sizeof (gmntopts), (opt)) if (flags & MNT_RDONLY) OPTADD(MNTOPT_RO); else @@ -121,10 +133,7 @@ statfs2mnttab(struct statfs *sfs, struct mnttab *mp) else OPTADD(MNTOPT_EXEC); #undef OPTADD - mp->mnt_special = strdup(sfs->f_mntfromname); - mp->mnt_mountp = strdup(sfs->f_mntonname); - mp->mnt_fstype = strdup(sfs->f_fstypename); - mp->mnt_mntopts = strdup(mntopts); + mp->mnt_mntopts = gmntopts; } static struct 
statfs *gsfs = NULL; @@ -166,7 +175,6 @@ statfs_init(void) int getmntany(FILE *fd __unused, struct mnttab *mgetp, struct mnttab *mrefp) { - // struct statfs *sfs; int i, error; error = statfs_init(); @@ -195,7 +203,6 @@ getmntany(FILE *fd __unused, struct mnttab *mgetp, struct mnttab *mrefp) int getmntent(FILE *fp, struct mnttab *mp) { - // struct statfs *sfs; int error, nfs; nfs = (int)lseek(fileno(fp), 0, SEEK_CUR); diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 82580a0e29d2..ac5cfba6c7b6 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -2563,7 +2563,7 @@ zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, "progress thread exited nonzero"))); } - if (flags->props || flags->holds || flags->backup) { + if (err == 0 && (flags->props || flags->holds || flags->backup)) { /* Write the final end record. */ err = send_conclusion_record(fd, NULL); if (err != 0) diff --git a/lib/libzfs/os/linux/libzfs_mount_os.c b/lib/libzfs/os/linux/libzfs_mount_os.c index 21d64053862e..b800e069e707 100644 --- a/lib/libzfs/os/linux/libzfs_mount_os.c +++ b/lib/libzfs/os/linux/libzfs_mount_os.c @@ -327,7 +327,7 @@ do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags) if (!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { char badopt[MNT_LINE_MAX] = {0}; - unsigned long mntflags = flags, zfsflags; + unsigned long mntflags = flags, zfsflags = 0; char myopts[MNT_LINE_MAX] = {0}; if (zfs_parse_mount_options(opts, &mntflags, diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c index fc04a7476154..742ac9786d72 100644 --- a/module/os/freebsd/zfs/vdev_file.c +++ b/module/os/freebsd/zfs/vdev_file.c @@ -234,6 +234,7 @@ vdev_file_io_strategy(void *arg) err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); abd_return_buf(zio->io_abd, buf, size); } + zio->io_error = err; if (resid != 0 && zio->io_error == 0) zio->io_error = ENOSPC; diff --git 
a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index ed739103bed9..2a579f1ac7cf 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -1823,6 +1823,7 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) vnode_t *dvp; uint64_t object = 0; uint64_t fid_gen = 0; + uint64_t setgen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; @@ -1838,7 +1839,6 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; - uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); @@ -1867,6 +1867,12 @@ zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) return (SET_ERROR(EINVAL)); } + if (fidp->fid_len == LONG_FID_LEN && (fid_gen > 1 || setgen != 0)) { + dprintf("snapdir fid: fid_gen (%llu) and setgen (%llu)\n", + (u_longlong_t)fid_gen, (u_longlong_t)setgen); + return (SET_ERROR(EINVAL)); + } + /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index f3a1b144d215..145b08595a64 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -113,6 +113,12 @@ VFS_SMR_DECLARE; #define VNCHECKREF(vp) #endif +#if __FreeBSD_version >= 1400045 +typedef uint64_t cookie_t; +#else +typedef ulong_t cookie_t; +#endif + /* * Programming rules. 
* @@ -1679,7 +1685,7 @@ zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags) /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, - int *ncookies, ulong_t **cookies) + int *ncookies, cookie_t **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; @@ -1701,7 +1707,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, boolean_t check_sysattrs; uint8_t type; int ncooks; - ulong_t *cooks = NULL; + cookie_t *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); @@ -1778,7 +1784,7 @@ zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, */ ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) - sizeof (((struct dirent *)NULL)->d_name) + 1); - cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK); + cooks = malloc(ncooks * sizeof (*cooks), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } @@ -4732,7 +4738,7 @@ struct vop_readdir_args { struct ucred *a_cred; int *a_eofflag; int *a_ncookies; - ulong_t **a_cookies; + cookie_t **a_cookies; }; #endif diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 450369192569..ffd96d159e36 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -210,7 +210,6 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) zvol_state_t *zv; int err = 0; boolean_t drop_suspend = B_FALSE; - boolean_t drop_namespace = B_FALSE; if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { /* @@ -226,6 +225,12 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); + /* + * Obtain a copy of private under zvol_state_lock to make sure either + * the result of zvol free code setting private to NULL is observed, + * or the zv is protected from being freed because of the positive + * zv_open_count. 
+ */ zv = pp->private; if (zv == NULL) { rw_exit(&zvol_state_lock); @@ -233,18 +238,6 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) goto out_locked; } - if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { - /* - * We need to guarantee that the namespace lock is held - * to avoid spurious failures in zvol_first_open. - */ - drop_namespace = B_TRUE; - if (!mutex_tryenter(&spa_namespace_lock)) { - rw_exit(&zvol_state_lock); - mutex_enter(&spa_namespace_lock); - goto retry; - } - } mutex_enter(&zv->zv_state_lock); if (zv->zv_zso->zso_dying) { rw_exit(&zvol_state_lock); @@ -276,8 +269,27 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { + boolean_t drop_namespace = B_FALSE; + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + + /* + * Take spa_namespace_lock to prevent lock inversion when + * zvols from one pool are opened as vdevs in another. + */ + if (!mutex_owned(&spa_namespace_lock)) { + if (!mutex_tryenter(&spa_namespace_lock)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + kern_yield(PRI_USER); + goto retry; + } else { + drop_namespace = B_TRUE; + } + } err = zvol_first_open(zv, !(flag & FWRITE)); + if (drop_namespace) + mutex_exit(&spa_namespace_lock); if (err) goto out_zv_locked; pp->mediasize = zv->zv_volsize; @@ -285,6 +297,8 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) pp->stripesize = zv->zv_volblocksize; } + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + /* * Check for a bad on-disk format version now since we * lied about owning the dataset readonly before. 
@@ -317,8 +331,6 @@ zvol_geom_open(struct g_provider *pp, int flag, int count) out_zv_locked: mutex_exit(&zv->zv_state_lock); out_locked: - if (drop_namespace) - mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); @@ -859,10 +871,15 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) struct zvol_state_dev *zsd; int err = 0; boolean_t drop_suspend = B_FALSE; - boolean_t drop_namespace = B_FALSE; retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); + /* + * Obtain a copy of si_drv2 under zvol_state_lock to make sure either + * the result of zvol free code setting si_drv2 to NULL is observed, + * or the zv is protected from being freed because of the positive + * zv_open_count. + */ zv = dev->si_drv2; if (zv == NULL) { rw_exit(&zvol_state_lock); @@ -870,20 +887,12 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) goto out_locked; } - if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { - /* - * We need to guarantee that the namespace lock is held - * to avoid spurious failures in zvol_first_open. - */ - drop_namespace = B_TRUE; - if (!mutex_tryenter(&spa_namespace_lock)) { - rw_exit(&zvol_state_lock); - mutex_enter(&spa_namespace_lock); - goto retry; - } - } mutex_enter(&zv->zv_state_lock); - + if (zv->zv_zso->zso_dying) { + rw_exit(&zvol_state_lock); + err = SET_ERROR(ENXIO); + goto out_zv_locked; + } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* @@ -909,12 +918,33 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { + boolean_t drop_namespace = B_FALSE; + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + + /* + * Take spa_namespace_lock to prevent lock inversion when + * zvols from one pool are opened as vdevs in another. 
+ */ + if (!mutex_owned(&spa_namespace_lock)) { + if (!mutex_tryenter(&spa_namespace_lock)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + kern_yield(PRI_USER); + goto retry; + } else { + drop_namespace = B_TRUE; + } + } err = zvol_first_open(zv, !(flags & FWRITE)); + if (drop_namespace) + mutex_exit(&spa_namespace_lock); if (err) goto out_zv_locked; } + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { err = SET_ERROR(EROFS); goto out_opened; @@ -949,8 +979,6 @@ zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) out_zv_locked: mutex_exit(&zv->zv_state_lock); out_locked: - if (drop_namespace) - mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 9d0d9de16a88..46a657cbea94 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1295,6 +1295,11 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) *objects = 0; for_each_online_node(sc.nid) { *objects += (*shrinker->scan_objects)(shrinker, &sc); + /* + * reset sc.nr_to_scan, modified by + * scan_objects == super_cache_scan + */ + sc.nr_to_scan = nr_to_scan; } } else { *objects = (*shrinker->scan_objects)(shrinker, &sc); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 44caadd587f7..69479b3f7988 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag) { zvol_state_t *zv; int error = 0; - boolean_t drop_suspend = B_TRUE; - boolean_t drop_namespace = B_FALSE; + boolean_t drop_suspend = B_FALSE; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); hrtime_t start = gethrtime(); @@ -517,7 +516,36 @@ zvol_open(struct block_device *bdev, fmode_t flag) return (SET_ERROR(-ENXIO)); } - if 
(zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) { + mutex_enter(&zv->zv_state_lock); + /* + * Make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + } else { + drop_suspend = B_TRUE; + } + } else { + drop_suspend = B_TRUE; + } + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + boolean_t drop_namespace = B_FALSE; + + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + /* * In all other call paths the spa_namespace_lock is taken * before the bdev->bd_mutex lock. However, on open(2) @@ -542,84 +570,53 @@ zvol_open(struct block_device *bdev, fmode_t flag) * the kernel so the only option is to return the error for * the caller to handle it. 
*/ - if (!mutex_tryenter(&spa_namespace_lock)) { - rw_exit(&zvol_state_lock); + if (!mutex_owned(&spa_namespace_lock)) { + if (!mutex_tryenter(&spa_namespace_lock)) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + /* zv_suspend_lock is no longer held */ + drop_suspend = B_FALSE; #ifdef HAVE_BLKDEV_GET_ERESTARTSYS - schedule(); - return (SET_ERROR(-ERESTARTSYS)); -#else - if ((gethrtime() - start) > timeout) + schedule(); return (SET_ERROR(-ERESTARTSYS)); +#else + if ((gethrtime() - start) > timeout) + return (SET_ERROR(-ERESTARTSYS)); - schedule_timeout(MSEC_TO_TICK(10)); - goto retry; + schedule_timeout(MSEC_TO_TICK(10)); + goto retry; #endif - } else { - drop_namespace = B_TRUE; - } - } - - mutex_enter(&zv->zv_state_lock); - /* - * make sure zvol is not suspended during first open - * (hold zv_suspend_lock) and respect proper lock acquisition - * ordering - zv_suspend_lock before zv_state_lock - */ - if (zv->zv_open_count == 0) { - if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { - mutex_exit(&zv->zv_state_lock); - rw_enter(&zv->zv_suspend_lock, RW_READER); - mutex_enter(&zv->zv_state_lock); - /* check to see if zv_suspend_lock is needed */ - if (zv->zv_open_count != 0) { - rw_exit(&zv->zv_suspend_lock); - drop_suspend = B_FALSE; + } else { + drop_namespace = B_TRUE; } } - } else { - drop_suspend = B_FALSE; - } - rw_exit(&zvol_state_lock); - - ASSERT(MUTEX_HELD(&zv->zv_state_lock)); - if (zv->zv_open_count == 0) { - ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); - if (error) - goto out_mutex; - } - if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { - error = -EROFS; - goto out_open_count; + if (drop_namespace) + mutex_exit(&spa_namespace_lock); } - zv->zv_open_count++; - - mutex_exit(&zv->zv_state_lock); - if (drop_namespace) - mutex_exit(&spa_namespace_lock); - if (drop_suspend) - rw_exit(&zv->zv_suspend_lock); - - zfs_check_media_change(bdev); - - return (0); + if (error == 0) { + if 
((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + if 
(zv->zv_open_count == 0) + zvol_last_close(zv); -out_open_count: - if (zv->zv_open_count == 0) - zvol_last_close(zv); + error = SET_ERROR(-EROFS); + } else { + zv->zv_open_count++; + } + } -out_mutex: mutex_exit(&zv->zv_state_lock); - if (drop_namespace) - mutex_exit(&spa_namespace_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); - return (SET_ERROR(error)); + if (error == 0) + zfs_check_media_change(bdev); + + return (error); } static void diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c index b8f82d52e8f0..2d83c1ac977c 100644 --- a/module/zfs/vdev_draid.c +++ b/module/zfs/vdev_draid.c @@ -841,6 +841,53 @@ vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr) ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size); } +/* + * Verify that all empty sectors are zero filled before using them to + * calculate parity. Otherwise, silent corruption in an empty sector will + * result in bad parity being generated. That bad parity will then be + * considered authoritative and overwrite the good parity on disk. This + * is possible because the checksum is only calculated over the data, + * thus it cannot be used to detect damage in empty sectors. 
+ */ +int +vdev_draid_map_verify_empty(zio_t *zio, raidz_row_t *rr) +{ + uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift; + uint64_t parity_size = rr->rr_col[0].rc_size; + uint64_t skip_off = parity_size - skip_size; + uint64_t empty_off = 0; + int ret = 0; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + ASSERT3P(rr->rr_abd_empty, !=, NULL); + ASSERT3U(rr->rr_bigcols, >, 0); + + void *zero_buf = kmem_zalloc(skip_size, KM_SLEEP); + + for (int c = rr->rr_bigcols; c < rr->rr_cols; c++) { + raidz_col_t *rc = &rr->rr_col[c]; + + ASSERT3P(rc->rc_abd, !=, NULL); + ASSERT3U(rc->rc_size, ==, parity_size); + + if (abd_cmp_buf_off(rc->rc_abd, zero_buf, skip_off, + skip_size) != 0) { + vdev_raidz_checksum_error(zio, rc, rc->rc_abd); + abd_zero_off(rc->rc_abd, skip_off, skip_size); + rc->rc_error = SET_ERROR(ECKSUM); + ret++; + } + + empty_off += skip_size; + } + + ASSERT3U(empty_off, ==, abd_get_size(rr->rr_abd_empty)); + + kmem_free(zero_buf, skip_size); + + return (ret); +} + /* * Given a logical address within a dRAID configuration, return the physical * address on the first drive in the group that this address maps to diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c index 1feebf7089b4..9a7cf665643c 100644 --- a/module/zfs/vdev_raidz.c +++ b/module/zfs/vdev_raidz.c @@ -1654,8 +1654,8 @@ vdev_raidz_io_start(zio_t *zio) /* * Report a checksum error for a child of a RAID-Z device. */ -static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) +void +vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; @@ -1725,6 +1725,13 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) abd_copy(orig[c], rc->rc_abd, rc->rc_size); } + /* + * Verify any empty sectors are zero filled to ensure the parity + * is calculated correctly even if these non-data sectors are damaged. 
+ */ + if (rr->rr_nempty && rr->rr_abd_empty != NULL) + ret += vdev_draid_map_verify_empty(zio, rr); + /* * Regenerates parity even for !tried||rc_error!=0 columns. This * isn't harmful but it does have the side effect of fixing stuff @@ -1739,7 +1746,7 @@ raidz_parity_verify(zio_t *zio, raidz_row_t *rr) continue; if (abd_cmp(orig[c], rc->rc_abd) != 0) { - raidz_checksum_error(zio, rc, orig[c]); + vdev_raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; } @@ -1799,7 +1806,6 @@ vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) (zio->io_flags & ZIO_FLAG_RESILVER)) { int n = raidz_parity_verify(zio, rr); unexpected_errors += n; - ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol); } if (zio->io_error == 0 && spa_writeable(zio->io_spa) && @@ -1925,7 +1931,7 @@ raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) */ if (rc->rc_error == 0 && c >= rr->rr_firstdatacol) { - raidz_checksum_error(zio, + vdev_raidz_checksum_error(zio, rc, rc->rc_orig_data); rc->rc_error = SET_ERROR(ECKSUM); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 170e392abe93..54749810d45d 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -323,7 +323,7 @@ zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) int zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { - int error = 0; + int error = 0, error1; ssize_t start_resid = zfs_uio_resid(uio); /* @@ -561,7 +561,11 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) continue; } #endif - if (error != 0) { + /* + * On FreeBSD, EFAULT should be propagated back to the + * VFS, which will handle faulting and will retry. 
+ */ + if (error != 0 && error != EFAULT) { dmu_tx_commit(tx); break; } @@ -645,7 +649,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, zfs_uio_offset(uio)); - ASSERT(error == 0); + ASSERT(error == 0 || error == EFAULT); } /* * If we are replaying and eof is non zero then force @@ -655,7 +659,10 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; - error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + if (error1 != 0) + /* Avoid clobbering EFAULT. */ + error = error1; zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, NULL, NULL); diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_errors.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_errors.ksh index 4645e245c973..a6833f167c66 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_errors.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_errors.ksh @@ -28,11 +28,12 @@ # in zpool status. # # STRATEGY: -# 1. Create a raidz or mirror pool +# 1. Create a mirror, raidz, or draid pool # 2. Inject read/write IO errors or checksum errors # 3. Verify the number of errors in zpool status match the corresponding # number of error events. -# 4. Repeat for all combinations of raidz/mirror and io/checksum errors. +# 4. Repeat for all combinations of mirror/raidz/draid and io/checksum +# errors. # . 
$STF_SUITE/include/libtest.shlib @@ -74,7 +75,7 @@ log_must mkdir -p $MOUNTDIR # Run error test on a specific type of pool # -# $1: pool - raidz, mirror +# $1: pool - mirror, raidz, draid # $2: test type - corrupt (checksum error), io # $3: read, write function do_test @@ -142,8 +143,8 @@ function do_test log_must zpool destroy $POOL } -# Test all types of errors on mirror and raidz pools -for pooltype in mirror raidz ; do +# Test all types of errors on mirror, raidz, and draid pools +for pooltype in mirror raidz draid; do do_test $pooltype corrupt read do_test $pooltype io read do_test $pooltype io write