Skip to content

Commit

Permalink
Add zfs_trim_zero parameter.
Browse files Browse the repository at this point in the history
This patch adds a new module tunable: zfs_trim_zero. When this
parameter is set to 1, ZFS makes sure that DKIOCTRIM zeroes the range
being TRIMmed. This makes data corruption issues easier to reproduce,
at the cost of greatly decreased performance.

On disk vdevs, zfs_trim_zero replaces DISCARD with zero page writes,
which means it will work even with vdevs that do not support DISCARD.
Note that DISCARD won't be used even on vdevs that support it.

On file vdevs, if FALLOC_FL_PUNCH_HOLE fails and zfs_trim_zero=1, then
a zero buffer will be written instead. It is not necessary to write
zeroes when hole punching succeeds, because FALLOC_FL_PUNCH_HOLE
guarantees that the punched range reads back as zeroes afterwards.
  • Loading branch information
dechamps committed Sep 18, 2012
1 parent fbc1546 commit 37806f0
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 25 deletions.
1 change: 1 addition & 0 deletions include/sys/vdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ typedef enum vdev_dtl_type {

extern int zfs_nocacheflush;
extern int zfs_notrim;
extern int zfs_trim_zero;

extern int vdev_open(vdev_t *);
extern void vdev_open_children(vdev_t *);
Expand Down
12 changes: 12 additions & 0 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ static vdev_ops_t *vdev_ops_table[] = {
/* maximum scrub/resilver I/O queue per leaf vdev */
int zfs_scrub_limit = 10;

/*
 * Debugging tunable: when nonzero, make sure TRIM zeroes the range being
 * trimmed. Defaults to 0 (off). This makes data corruption issues easier
 * to reproduce, at the cost of greatly decreased performance.
 *
 * On disk vdevs, don't use DISCARD and write zero pages instead.
 *
 * On file vdevs, if hole punching fails, then write zeroes instead.
 */
int zfs_trim_zero = 0;

/*
* Given a vdev type, return the appropriate ops vector.
*/
Expand Down Expand Up @@ -3200,4 +3209,7 @@ EXPORT_SYMBOL(vdev_clear);

module_param(zfs_scrub_limit, int, 0644);
MODULE_PARM_DESC(zfs_scrub_limit, "Max scrub/resilver I/O per leaf vdev");

/* Expose zfs_trim_zero as an int module parameter with mode 0644. */
module_param(zfs_trim_zero, int, 0644);
MODULE_PARM_DESC(zfs_trim_zero, "Make sure TRIM zeroes data (only for debugging)");
#endif
35 changes: 24 additions & 11 deletions module/zfs/vdev_disk.c
Original file line number Diff line number Diff line change
Expand Up @@ -494,15 +494,22 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
if (flags & REQ_DISCARD) {
ASSERT(!kbuf_ptr);

if (!blk_queue_discard(q) ||
!q->limits.max_discard_sectors)
return EOPNOTSUPP;

max_discard_size = MIN(q->limits.max_discard_sectors << 9,
INT_MAX);
if (q->limits.discard_granularity)
max_discard_size &= ~(q->limits.discard_granularity - 1);
max_discard_size &= ~511;
if (zfs_trim_zero)
max_discard_size = PAGE_SIZE;
else {
if (!blk_queue_discard(q) ||
!q->limits.max_discard_sectors)
return EOPNOTSUPP;

max_discard_size = MIN(
q->limits.max_discard_sectors << 9,
INT_MAX);
if (q->limits.discard_granularity)
max_discard_size &=
~(q->limits.discard_granularity
- 1);
max_discard_size &= ~511;
}
}

ASSERT3U(kbuf_offset + kbuf_size, <=, bdev->bd_inode->i_size);
Expand Down Expand Up @@ -563,8 +570,14 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, caddr_t kbuf_ptr,
dr->dr_bio[i]->bi_private = dr;

if (flags & REQ_DISCARD) {
dr->dr_bio[i]->bi_size = MIN(bio_size,
max_discard_size);
if (zfs_trim_zero) {
dr->dr_bio[i]->bi_rw &= ~REQ_DISCARD;
bio_add_page(dr->dr_bio[i],
ZERO_PAGE(0), MIN(bio_size,
max_discard_size), 0);
} else
dr->dr_bio[i]->bi_size = MIN(bio_size,
max_discard_size);
bio_size -= dr->dr_bio[i]->bi_size;
} else {
/*
Expand Down
62 changes: 48 additions & 14 deletions module/zfs/vdev_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Outside the kernel this file supplies its own zero-filled buffer (static
 * storage is zero-initialized). vdev_file_trim() writes from it, in chunks
 * of at most sizeof (empty_zero_page), when it falls back to zero-filling.
 * NOTE(review): kernel builds presumably get empty_zero_page from the
 * kernel headers - confirm it is an array there so sizeof is meaningful.
 */
#ifndef _KERNEL
static char empty_zero_page[4096];
#endif

/*
* Virtual device vector for files.
*/
Expand Down Expand Up @@ -137,13 +141,55 @@ vdev_file_close(vdev_t *vd)
vd->vdev_tsd = NULL;
}

/*
 * Service a TRIM request on a file vdev.
 *
 * The range [io_offset, io_offset + io_size) is first handed to
 * VOP_SPACE(F_FREESP) so the backing file can release it. If that fails
 * and zfs_trim_zero is set, the range is instead overwritten with zeroes,
 * one empty_zero_page-sized chunk at a time.
 *
 * The outcome is reported through zio->io_error:
 *   - EOPNOTSUPP immediately if the vdev is already flagged vdev_notrim;
 *   - the vn_rdwr() error, or ENOSPC on a short write, if zero-filling
 *     fails part way through;
 *   - otherwise the result of the last operation performed.
 * A final EOPNOTSUPP also sets vdev_notrim so later requests fail fast.
 *
 * NOTE(review): once vdev_notrim is set, this returns before the
 * zfs_trim_zero fallback is considered - confirm that is intended.
 */
static void
vdev_file_trim(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_file_t *vf = vd->vdev_tsd;
	struct flock fl;
	ssize_t resid = 0;
	uint64_t chunk;
	int error;

	if (vd->vdev_notrim) {
		zio->io_error = EOPNOTSUPP;
		return;
	}

	/* Describe the byte range to free, measured from offset 0. */
	bzero(&fl, sizeof (fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = 0;
	fl.l_start = zio->io_offset;
	fl.l_len = zio->io_size;

	error = VOP_SPACE(vf->vf_vnode, F_FREESP, &fl,
	    FWRITE | FOFFMAX, zio->io_offset, kcred, NULL);

	if (error != 0 && zfs_trim_zero) {
		/*
		 * Freeing the range failed: zero it explicitly, reusing fl
		 * as the cursor over what is left to write.
		 */
		for (; fl.l_len > 0; fl.l_len -= chunk, fl.l_start += chunk) {
			chunk = MIN(fl.l_len, sizeof(empty_zero_page));

			error = vn_rdwr(UIO_WRITE, vf->vf_vnode,
			    empty_zero_page, chunk, fl.l_start,
			    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred,
			    &resid);

			/* A short write with no error is reported as ENOSPC. */
			if (error == 0 && resid != 0)
				error = ENOSPC;

			if (error != 0) {
				zio->io_error = error;
				return;
			}
		}
	}

	zio->io_error = error;

	if (error == EOPNOTSUPP)
		vd->vdev_notrim = B_TRUE;
}

static int
vdev_file_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_file_t *vf;
ssize_t resid = 0;
struct flock fl;

if (!vdev_readable(vd)) {
zio->io_error = ENXIO;
Expand All @@ -159,19 +205,7 @@ vdev_file_io_start(zio_t *zio)
kcred, NULL);
break;
case DKIOCTRIM:
if (vd->vdev_notrim)
zio->io_error = EOPNOTSUPP;
else {
bzero(&fl, sizeof(fl));
fl.l_type = F_WRLCK;
fl.l_whence = 0;
fl.l_start = zio->io_offset;
fl.l_len = zio->io_size;
zio->io_error = VOP_SPACE(vf->vf_vnode, F_FREESP, &fl,
FWRITE | FOFFMAX, zio->io_offset, kcred, NULL);
if (zio->io_error == EOPNOTSUPP)
vd->vdev_notrim = B_TRUE;
}
vdev_file_trim(zio);
break;
default:
zio->io_error = ENOTSUP;
Expand Down

0 comments on commit 37806f0

Please sign in to comment.