From 37806f0508423ab6388ddbfbff186580932427b8 Mon Sep 17 00:00:00 2001 From: Etienne Dechamps Date: Tue, 18 Sep 2012 13:25:02 +0200 Subject: [PATCH] Add zfs_trim_zero parameter. This patch adds a new module tunable: zfs_trim_zero. When this parameter is set to 1, ZFS makes sure that DKIOCTRIM zeroes the range being TRIMmed. This makes data corruption issues easier to reproduce, at the cost of greatly decreased performance. On disk vdevs, zfs_trim_zero replaces DISCARD with zero page writes, which means it will work even with vdevs that do not support DISCARD. Note that DISCARD won't be used even on vdevs that support it. On file vdevs, if FALLOC_FL_PUNCH_HOLE fails and zfs_trim_zero=1, then a zero buffer will be written instead. It is not necessary to write zeroes if hole punching works because FALLOC_FL_PUNCH_HOLE specifies that the specified range must appear as zeroes after hole punching. --- include/sys/vdev.h | 1 + module/zfs/vdev.c | 12 ++++++++ module/zfs/vdev_disk.c | 35 ++++++++++++++++-------- module/zfs/vdev_file.c | 62 ++++++++++++++++++++++++++++++++---------- 4 files changed, 85 insertions(+), 25 deletions(-) diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 5885d50cc489..35a29ed095dc 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -46,6 +46,7 @@ typedef enum vdev_dtl_type { extern int zfs_nocacheflush; extern int zfs_notrim; +extern int zfs_trim_zero; extern int vdev_open(vdev_t *); extern void vdev_open_children(vdev_t *); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 6b9aa733bcd8..3c399ae030df 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -64,6 +64,15 @@ static vdev_ops_t *vdev_ops_table[] = { /* maximum scrub/resilver I/O queue per leaf vdev */ int zfs_scrub_limit = 10; +/* + * Make sure TRIM zeroes data. + * + * On disk vdevs, don't use DISCARD and write zero pages instead. + * + * On file vdevs, if hole punching fails, then write zeroes instead. + */ +int zfs_trim_zero = 0; + /* * Given a vdev type, return the appropriate ops vector. */ @@ -3200,4 +3209,7 @@ EXPORT_SYMBOL(vdev_clear); module_param(zfs_scrub_limit, int, 0644); MODULE_PARM_DESC(zfs_scrub_limit, "Max scrub/resilver I/O per leaf vdev"); + +module_param(zfs_trim_zero, int, 0644); +MODULE_PARM_DESC(zfs_trim_zero, "Make sure TRIM zeroes data (only for debugging)"); #endif diff --git a/module/zfs/vdev_disk.c b/module/zfs/vdev_disk.c index 6edabcd6c45d..dbab8992f613 100644 --- a/module/zfs/vdev_disk.c +++ b/module/zfs/vdev_disk.c @@ -494,15 +494,22 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, if (flags & REQ_DISCARD) { ASSERT(!kbuf_ptr); - if (!blk_queue_discard(q) || - !q->limits.max_discard_sectors) - return EOPNOTSUPP; - - max_discard_size = MIN(q->limits.max_discard_sectors << 9, - INT_MAX); - if (q->limits.discard_granularity) - max_discard_size &= ~(q->limits.discard_granularity - 1); - max_discard_size &= ~511; + if (zfs_trim_zero) + max_discard_size = PAGE_SIZE; + else { + if (!blk_queue_discard(q) || + !q->limits.max_discard_sectors) + return EOPNOTSUPP; + + max_discard_size = MIN( + q->limits.max_discard_sectors << 9, + INT_MAX); + if (q->limits.discard_granularity) + max_discard_size &= + ~(q->limits.discard_granularity + - 1); + max_discard_size &= ~511; + } } ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size); @@ -563,8 +570,14 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr, dr->dr_bio[i]->bi_private = dr; if (flags & REQ_DISCARD) { - dr->dr_bio[i]->bi_size = MIN(bio_size, - max_discard_size); + if (zfs_trim_zero) { + dr->dr_bio[i]->bi_rw &= ~REQ_DISCARD; + bio_add_page(dr->dr_bio[i], + ZERO_PAGE(0), MIN(bio_size, + max_discard_size), 0); + } else + dr->dr_bio[i]->bi_size = MIN(bio_size, + max_discard_size); bio_size -= dr->dr_bio[i]->bi_size; } else { /* diff --git a/module/zfs/vdev_file.c b/module/zfs/vdev_file.c index 9ff6b6e80c1f..57ca54535b4e 100644 --- a/module/zfs/vdev_file.c +++ b/module/zfs/vdev_file.c @@ -30,6 +30,10 @@ #include #include +#ifndef _KERNEL +static char empty_zero_page[4096]; +#endif + /* * Virtual device vector for files. */ @@ -137,13 +141,55 @@ vdev_file_close(vdev_t *vd) vd->vdev_tsd = NULL; } +static void vdev_file_trim(zio_t *zio) +{ + struct flock fl; + uint64_t len; + ssize_t resid = 0; + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + + if (vd->vdev_notrim) { + zio->io_error = EOPNOTSUPP; + return; + } + + bzero(&fl, sizeof(fl)); + fl.l_type = F_WRLCK; + fl.l_whence = 0; + fl.l_start = zio->io_offset; + fl.l_len = zio->io_size; + zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &fl, + FWRITE | FOFFMAX, zio->io_offset, kcred, NULL); + + if (zfs_trim_zero && zio->io_error) { + while (fl.l_len > 0) { + len = MIN(fl.l_len, sizeof(empty_zero_page)); + + zio->io_error = vn_rdwr(UIO_WRITE, vf->vf_vnode, + empty_zero_page, len, fl.l_start, + UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, + &resid); + if (resid != 0 && zio->io_error == 0) + zio->io_error = ENOSPC; + if (zio->io_error) + return; + + fl.l_len -= len; + fl.l_start += len; + } + } + + if (zio->io_error == EOPNOTSUPP) + vd->vdev_notrim = B_TRUE; +} + static int vdev_file_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_file_t *vf; ssize_t resid = 0; - struct flock fl; if (!vdev_readable(vd)) { zio->io_error = ENXIO; @@ -159,19 +205,7 @@ vdev_file_io_start(zio_t *zio) kcred, NULL); break; case DKIOCTRIM: - if (vd->vdev_notrim) - zio->io_error = EOPNOTSUPP; - else { - bzero(&fl, sizeof(fl)); - fl.l_type = F_WRLCK; - fl.l_whence = 0; - fl.l_start = zio->io_offset; - fl.l_len = zio->io_size; - zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &fl, - FWRITE | FOFFMAX, zio->io_offset, kcred, NULL); - if (zio->io_error == EOPNOTSUPP) - vd->vdev_notrim = B_TRUE; - } + vdev_file_trim(zio); break; default: zio->io_error = ENOTSUP;