Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OpenZFS 7578 - Fix/improve some aspects of ZIL writing. #6191

Merged
merged 1 commit into from
Jun 9, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion cmd/ztest/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -1620,7 +1620,6 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
itx->itx_private = zd;
itx->itx_wr_state = write_state;
itx->itx_sync = (ztest_random(8) == 0);
itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);

bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
sizeof (*lr) - sizeof (lr_t));
Expand Down
8 changes: 2 additions & 6 deletions include/sys/trace_zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
__field(uint64_t, zl_parse_lr_count)
__field(uint64_t, zl_next_batch)
__field(uint64_t, zl_com_batch)
__field(uint64_t, zl_itx_list_sz)
__field(uint64_t, zl_cur_used)
__field(clock_t, zl_replay_time)
__field(uint64_t, zl_replay_blks)
Expand All @@ -88,7 +87,6 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
__entry->zl_parse_lr_count = zilog->zl_parse_lr_count;
__entry->zl_next_batch = zilog->zl_next_batch;
__entry->zl_com_batch = zilog->zl_com_batch;
__entry->zl_itx_list_sz = zilog->zl_itx_list_sz;
__entry->zl_cur_used = zilog->zl_cur_used;
__entry->zl_replay_time = zilog->zl_replay_time;
__entry->zl_replay_blks = zilog->zl_replay_blks;
Expand All @@ -98,17 +96,15 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
"replay %u stop_sync %u writer %u logbias %u sync %u "
"parse_error %u parse_blk_seq %llu parse_lr_seq %llu "
"parse_blk_count %llu parse_lr_count %llu next_batch %llu "
"com_batch %llu itx_list_sz %llu cur_used %llu replay_time %lu "
"replay_blks %llu }",
"com_batch %llu cur_used %llu replay_time %lu replay_blks %llu }",
__entry->zl_lr_seq, __entry->zl_commit_lr_seq,
__entry->zl_destroy_txg, __entry->zl_replaying_seq,
__entry->zl_suspend, __entry->zl_suspending, __entry->zl_keep_first,
__entry->zl_replay, __entry->zl_stop_sync, __entry->zl_writer,
__entry->zl_logbias, __entry->zl_sync, __entry->zl_parse_error,
__entry->zl_parse_blk_seq, __entry->zl_parse_lr_seq,
__entry->zl_parse_blk_count, __entry->zl_parse_lr_count,
__entry->zl_next_batch, __entry->zl_com_batch,
__entry->zl_itx_list_sz, __entry->zl_cur_used,
__entry->zl_next_batch, __entry->zl_com_batch, __entry->zl_cur_used,
__entry->zl_replay_time, __entry->zl_replay_blks)
);
/* END CSTYLED */
Expand Down
1 change: 0 additions & 1 deletion include/sys/zil.h
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,6 @@ typedef struct itx {
uint8_t itx_sync; /* synchronous transaction */
zil_callback_t itx_callback; /* Called when the itx is persistent */
void *itx_callback_data; /* User data for the callback */
uint64_t itx_sod; /* record size on disk */
uint64_t itx_oid; /* object id */
lr_t itx_lr; /* common part of log record */
/* followed by type-specific part of lr_xx_t and its immediate data */
Expand Down
20 changes: 18 additions & 2 deletions include/sys/zil_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ typedef struct lwb {
zilog_t *lwb_zilog; /* back pointer to log struct */
blkptr_t lwb_blk; /* on disk address of this log blk */
boolean_t lwb_fastwrite; /* is blk marked for fastwrite? */
boolean_t lwb_slog; /* lwb_blk is on SLOG device */
int lwb_nused; /* # used bytes in buffer */
int lwb_sz; /* size of block and buffer */
char *lwb_buf; /* log write buffer */
Expand All @@ -62,7 +63,6 @@ typedef struct itxs {
typedef struct itxg {
kmutex_t itxg_lock; /* lock for this structure */
uint64_t itxg_txg; /* txg for this chain */
uint64_t itxg_sod; /* total size on disk for this txg */
itxs_t *itxg_itxs; /* sync and async itxs */
} itxg_t;

Expand Down Expand Up @@ -120,7 +120,6 @@ struct zilog {
kcondvar_t zl_cv_batch[2]; /* batch condition variables */
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
list_t zl_itx_commit_list; /* itx list to be committed */
uint64_t zl_itx_list_sz; /* total size of records on list */
uint64_t zl_cur_used; /* current commit log size used */
list_t zl_lwb_list; /* in-flight log write list */
kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
Expand All @@ -140,9 +139,26 @@ typedef struct zil_bp_node {
avl_node_t zn_node;
} zil_bp_node_t;

/*
* Maximum amount of write data that can be put into single log block.
*/
#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
sizeof (lr_write_t))

/*
* Maximum amount of log space we agree to waste to reduce number of
* WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%).
*/
#define ZIL_MAX_WASTE_SPACE (ZIL_MAX_LOG_DATA / 8)

/*
* Maximum amount of write data for WR_COPIED. Fall back to WR_NEED_COPY
* as more space efficient if we can't fit at least two log records into
* maximum sized log block.
*/
#define ZIL_MAX_COPIED_DATA ((SPA_OLD_MAXBLOCKSIZE - \
sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))

#ifdef __cplusplus
}
#endif
Expand Down
2 changes: 1 addition & 1 deletion include/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp, enum zio_flag flags);

extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
uint64_t size, boolean_t use_slog);
uint64_t size, boolean_t *slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern void zio_shrink(zio_t *zio, uint64_t size);
Expand Down
8 changes: 5 additions & 3 deletions man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -1935,12 +1935,14 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
.sp
.ne 2
.na
\fBzil_slog_limit\fR (ulong)
\fBzil_slog_bulk\fR (ulong)
.ad
.RS 12n
Max commit bytes to separate log device
Limit SLOG write size per commit executed with synchronous priority.
Any writes above that will be executed with lower (asynchronous) priority
to limit potential SLOG device abuse by single active ZIL writer.
.sp
Default value: \fB1,048,576\fR.
Default value: \fB786,432\fR.
.RE

.sp
Expand Down
37 changes: 15 additions & 22 deletions module/zfs/zfs_log.c
Original file line number Diff line number Diff line change
Expand Up @@ -487,10 +487,9 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
znode_t *zp, offset_t off, ssize_t resid, int ioflag,
zil_callback_t callback, void *callback_data)
{
uint32_t blocksize = zp->z_blksz;
itx_wr_state_t write_state;
boolean_t slogging;
uintptr_t fsync_cnt;
ssize_t immediate_write_sz;

if (zil_replaying(zilog, tx) || zp->z_unlinked ||
zfs_xattr_owner_unlinked(zp)) {
Expand All @@ -499,12 +498,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
return;
}

immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
? 0 : (ssize_t)zfs_immediate_write_sz;

slogging = spa_has_slogs(zilog->zl_spa) &&
(zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
write_state = WR_INDIRECT;
else if (!spa_has_slogs(zilog->zl_spa) &&
resid >= zfs_immediate_write_sz)
write_state = WR_INDIRECT;
else if (ioflag & (FSYNC | FDSYNC))
write_state = WR_COPIED;
Expand All @@ -518,30 +515,26 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
while (resid) {
itx_t *itx;
lr_write_t *lr;
ssize_t len;
itx_wr_state_t wr_state = write_state;
ssize_t len = resid;

/*
* If the write would overflow the largest block then split it.
*/
if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
len = SPA_OLD_MAXBLOCKSIZE >> 1;
else
len = resid;
if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
wr_state = WR_NEED_COPY;
else if (wr_state == WR_INDIRECT)
len = MIN(blocksize - P2PHASE(off, blocksize), resid);

itx = zil_itx_create(txtype, sizeof (*lr) +
(write_state == WR_COPIED ? len : 0));
(wr_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
if (write_state == WR_COPIED && dmu_read(ZTOZSB(zp)->z_os,
if (wr_state == WR_COPIED && dmu_read(ZTOZSB(zp)->z_os,
zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
write_state = WR_NEED_COPY;
wr_state = WR_NEED_COPY;
}

itx->itx_wr_state = write_state;
if (write_state == WR_NEED_COPY)
itx->itx_sod += len;
itx->itx_wr_state = wr_state;
lr->lr_foid = zp->z_id;
lr->lr_offset = off;
lr->lr_length = len;
Expand Down
Loading