Skip to content

Commit

Permalink
Change ZVOL to use _by_dnode() methods (#240)
Browse files Browse the repository at this point in the history
* Make zvol operations use _by_dnode routines

This continues what was started in
0eef1bde31d67091d3deed23fe2394f5a8bf2276 by fully converting zvols
to avoid unnecessary dnode_hold() calls. This saves a small amount
of CPU time and slightly improves latencies of operations on zvols.

Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Richard Yao <[email protected]>
Closes #6058

Porting-notes:

Cleaned up previous style, which was based on OSX. Since we can use
the standard uio call, we should take advantage of that.

Additionally deleted all the symlink code, as that is specific to
OSX.

Ported-by: Jorgen Lundman <[email protected]>

* zvol_write() can use dmu_tx_hold_write_by_dnode()

We can improve the performance of writes to zvols by using
dmu_tx_hold_write_by_dnode() instead of dmu_tx_hold_write().  This
reduces lock contention on the first block of the dnode object, and also
reduces the amount of CPU needed.  The benefit will be highest with
multi-threaded async writes (i.e. writes that don't call zil_commit()).

Signed-off-by: Matthew Ahrens <[email protected]>

* Warn if volblocksize is smaller than async

$ ./zfs create -s -V 3GB -b 2048 tank/vol
Warning: volblocksize (2048) < ashift (12 / 4096)
means all writes are amplified and space wasted.

* Only sync zvol if sync=always

Co-authored-by: Richard Yao <[email protected]>
Co-authored-by: Matthew Ahrens <[email protected]>
  • Loading branch information
3 people authored May 13, 2020
1 parent 271ea63 commit fb5cd5b
Show file tree
Hide file tree
Showing 6 changed files with 176 additions and 614 deletions.
13 changes: 5 additions & 8 deletions ZFSin/zfs/include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -837,25 +837,22 @@ void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
//#include <linux/blkdev_compat.h>
struct iomem;

int dmu_read_win(objset_t *os, uint64_t object, uint64_t *offset,
uint64_t position, uint64_t *size, void *iomem);
int dmu_read_win_dbuf(dmu_buf_t *zdb, uint64_t object, uint64_t *offset,
uint64_t position, uint64_t *size, void *iomem);
int dmu_read_req(objset_t *os, uint64_t object, struct request *req);
int dmu_write_req(objset_t *os, uint64_t object, struct request *req,
dmu_tx_t *tx);
dmu_tx_t *tx);
int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size);
int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
int dmu_write_win_dbuf(dmu_buf_t *zdb, uint64_t *offset, uint64_t position,
uint64_t *size, void *iomem, dmu_tx_t *tx);
int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
int dmu_allocate_check(objset_t *z_os, off_t length);
struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size,
dmu_tx_t *tx);

#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
Expand Down
13 changes: 3 additions & 10 deletions ZFSin/zfs/include/sys/zvol.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,9 @@ typedef struct zvol_state {
uint32_t zv_total_opens; /* total open count */
zilog_t *zv_zilog; /* ZIL handle */
rangelock_t zv_rangelock; /* for range locking */
dnode_t *zv_dn; /* dnode hold */
list_t zv_extents; /* List of extents for dump */
dmu_buf_t *zv_dbuf; /* bonus handle */
zvol_iokit_t *zv_iokitdev; /* IOKit device */
uint64_t zv_openflags; /* Remember flags used at open */
char zv_bsdname[MAXPATHLEN];
/* 'rdiskX' name, use [1] for diskX */
uint8_t zv_target_id;
uint8_t zv_lun_id;
} zvol_state_t;
Expand Down Expand Up @@ -128,8 +125,6 @@ extern void *zvol_tag(zvol_state_t *);

extern int zvol_open(dev_t dev, int flag, int otyp, struct proc *p);
extern int zvol_close(dev_t dev, int flag, int otyp, struct proc *p);
extern int zvol_read(dev_t dev, struct uio *uiop, int p);
extern int zvol_write(dev_t dev, struct uio *uiop, int p);

extern zvol_state_t *zvol_targetlun_lookup(uint8_t target, uint8_t lun);

Expand All @@ -151,11 +146,9 @@ extern int zvol_close_impl(zvol_state_t *zv, int flag,

extern int zvol_get_volume_blocksize(dev_t dev);

extern int zvol_read_win(zvol_state_t *zv, uint64_t offset,
uint64_t count, void *iomem);
extern int zvol_read(zvol_state_t *zv, uio_t *uio);
extern int zvol_write(zvol_state_t *zv, uio_t *uio);

extern int zvol_write_win(zvol_state_t *zv, uint64_t offset,
uint64_t count, void *iomem);
extern int zvol_unmap(zvol_state_t *zv, uint64_t off, uint64_t bytes);

extern void zvol_add_symlink(zvol_state_t *zv, const char *bsd_disk,
Expand Down
11 changes: 11 additions & 0 deletions ZFSin/zfs/lib/libzfs/libzfs_dataset.c
Original file line number Diff line number Diff line change
Expand Up @@ -3802,6 +3802,9 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
zpool_close(zpool_handle);
return (-1);
}

// Look up ashift for later, in case we need it.
uint64_t ashift = zpool_get_prop_int(zpool_handle, ZPOOL_PROP_ASHIFT, NULL);
zpool_close(zpool_handle);

if (type == ZFS_TYPE_VOLUME) {
Expand Down Expand Up @@ -3848,6 +3851,14 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type,
"size"));
return (zfs_error(hdl, EZFS_BADPROP, errbuf));
}

if ((ashift > 0ULL) &&
(blocksize < (1ULL << ashift))) {
(void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: "
"volblocksize (%llu) < ashift (%llu / %llu)\n"
"means all writes are amplified and space wasted.\n"),
blocksize, ashift, 1ULL << ashift);
}
}

(void) parent_name(path, parent, sizeof (parent));
Expand Down
235 changes: 12 additions & 223 deletions ZFSin/zfs/module/zfs/dmu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1415,8 +1415,7 @@ xuio_stat_wbuf_nocopy(void)
}

#ifdef _KERNEL

static int
int
dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
{
dmu_buf_t **dbp;
Expand Down Expand Up @@ -1511,25 +1510,24 @@ dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
int
dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
{
dnode_t *dn;
int err;
dnode_t *dn;
int err;

if (size == 0)
return (0);
if (size == 0)
return (0);

err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);

err = dmu_read_uio_dnode(dn, uio, size);
err = dmu_read_uio_dnode(dn, uio, size);

dnode_rele(dn, FTAG);
dnode_rele(dn, FTAG);

return (err);
return (err);
}


static int
int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
Expand Down Expand Up @@ -1636,215 +1634,6 @@ dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
return (err);
}

/*
* Support function for Win, iomem is an void* passed back
*/
int
dmu_read_win(objset_t *os, uint64_t object, uint64_t *offset,
uint64_t position, uint64_t *size, void *iomem)
{
dmu_buf_t **dbp;
int numbufs, i, err = 0;
/*
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
*/
//err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
// &numbufs, &dbp);
err = dmu_buf_hold_array(os, object, position+*offset, *size, TRUE, FTAG,
&numbufs, &dbp);
if (err)
return (err);

while (*size > 0) {

for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
dmu_buf_t *db = dbp[i];
uint64_t done=0;

ASSERT(size > 0);

//bufoff = uio->uio_loffset - db->db_offset;
bufoff = (position+*offset) - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, *size);

/*
err = uiomove((char *)db->db_data + bufoff, tocpy,
UIO_READ, uio);
*/
#if __APPLE__
done = zvolIO_kit_read(iomem,
*offset,
(char *)db->db_data + bufoff,
tocpy);
#endif
RtlMoveMemory((void *)((uintptr_t)iomem + *offset),
(void *)((uintptr_t)db->db_data + bufoff),
tocpy);
done = tocpy;

if (done > 0) {
(*offset) += done;
(*size) -= done;
}

//size -= tocpy;
}
}
dmu_buf_rele_array(dbp, numbufs, FTAG);

return (err);
}

int
dmu_read_win_dbuf(dmu_buf_t *zdb, uint64_t object, uint64_t *offset,
uint64_t position, uint64_t *size, void *iomem)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
dnode_t *dn;
int err;

if (*size == 0)
return (0);

DB_DNODE_ENTER(db);
dn = DB_DNODE(db);


dmu_buf_t **dbp;
int numbufs, i;

/*
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
*/
err = dmu_buf_hold_array_by_dnode(dn, (position+*offset), *size,
TRUE, FTAG, &numbufs, &dbp, 0);
if (err)
return (err);

for (i = 0; i < numbufs; i++) {
uint64_t tocpy;
int64_t bufoff;
dmu_buf_t *db = dbp[i];
uint64_t done=0;

ASSERT(size > 0);

bufoff = (position+*offset) - db->db_offset;
tocpy = MIN(db->db_size - bufoff, *size);
#ifdef __APPLE__
done = zvolIO_kit_read(iomem,
*offset,
(char *)db->db_data + bufoff,
tocpy);
#endif
RtlMoveMemory((void *)((uintptr_t)iomem + *offset),
(void *)((uintptr_t)db->db_data + bufoff),
tocpy);
done = tocpy;

if (!done) {
err = EIO;
break;
}

*size -= done;
*offset += done;
}
dmu_buf_rele_array(dbp, numbufs, FTAG);

DB_DNODE_EXIT(db);

return (err);
}

static int
dmu_write_win_dnode(dnode_t *dn, uint64_t *offset, uint64_t position,
uint64_t *size, void *iomem, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
int numbufs;
int err = 0;
int i;

//err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
// FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
err = dmu_buf_hold_array_by_dnode(dn, *offset+position, *size,
FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
if (err)
return (err);

while(*size > 0) {

for (i = 0; i < numbufs; i++) {
int tocpy;
int bufoff;
uint64_t done=0;
dmu_buf_t *db = dbp[i];

ASSERT(size > 0);

//bufoff = uio->uio_loffset - db->db_offset;
bufoff = (position + *offset) - db->db_offset;
tocpy = (int)MIN(db->db_size - bufoff, *size);

ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

if (tocpy == db->db_size)
dmu_buf_will_fill(db, tx);
else
dmu_buf_will_dirty(db, tx);

/*
* XXX uiomove could block forever (eg.nfs-backed
* pages). There needs to be a uiolockdown() function
* to lock the pages in memory, so that uiomove won't
* block.
*/
/*
err = uiomove((char *)db->db_data + bufoff, tocpy,
UIO_WRITE, uio);
*/
RtlMoveMemory((void *)((uintptr_t)db->db_data + bufoff),
(void *)((uintptr_t)iomem + *offset),
tocpy);
done = tocpy;

if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);

if (done > 0) {
*offset += done;
*size -= done;
}
}
}

dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}

int
dmu_write_win_dbuf(dmu_buf_t *zdb, uint64_t *offset, uint64_t position,
uint64_t *size, void *iomem, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
dnode_t *dn;
int err;

if (size == 0)
return (0);

DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
err = dmu_write_win_dnode(dn, offset, position, size, iomem, tx);
DB_DNODE_EXIT(db);

return (err);
}
#endif /* _KERNEL */

/*
Expand Down
20 changes: 18 additions & 2 deletions ZFSin/zfs/module/zfs/zfs_windows_zvol_scsi.c
Original file line number Diff line number Diff line change
Expand Up @@ -726,16 +726,32 @@ wzvol_WkRtn(__in PVOID pWkParms) // Parm list pointer.
goto Done;
}

// Create an uio for the IO. If we can possibly embed
// the uio in some Extension to this IO, we could
// save the allocation here.
uio_t *uio = uio_create(1, 0, UIO_SYSSPACE,
ActionRead == pWkRtnParms->Action ? UIO_READ : UIO_WRITE);
if (uio == NULL) {
dprintf("%s: out of memory.\n", __func__);
status = SRB_STATUS_INVALID_REQUEST;
goto Done;
}
VERIFY0(uio_addiov(uio, (user_addr_t)pSrb->DataBuffer,
pSrb->DataTransferLength));
uio_setoffset(uio, sectorOffset);

/* Call ZFS to read/write data */
if (ActionRead == pWkRtnParms->Action) {
status = zvol_read_win(zv, sectorOffset, pSrb->DataTransferLength, pSrb->DataBuffer);
status = zvol_read(zv, uio);
} else {
status = zvol_write_win(zv, sectorOffset, pSrb->DataTransferLength, pSrb->DataBuffer);
status = zvol_write(zv, uio);
}

if (status == 0)
status = SRB_STATUS_SUCCESS;

uio_free(uio);

Done:
pSrb->SrbStatus = status;

Expand Down
Loading

0 comments on commit fb5cd5b

Please sign in to comment.