diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h
index bdddcc366b8d..e8b701631362 100644
--- a/include/sys/zfs_znode.h
+++ b/include/sys/zfs_znode.h
@@ -354,7 +354,8 @@ extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
     znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp);
 extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-    znode_t *zp, offset_t off, ssize_t len, int ioflag);
+    znode_t *zp, offset_t off, ssize_t len, int ioflag,
+    zil_callback_t callback, void *callback_data);
 extern void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, uint64_t off, uint64_t len);
 extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
diff --git a/include/sys/zil.h b/include/sys/zil.h
index 589e28f83752..db6ee1103c4c 100644
--- a/include/sys/zil.h
+++ b/include/sys/zil.h
@@ -362,11 +362,15 @@ typedef enum {
 	WR_NUM_STATES	/* number of states */
 } itx_wr_state_t;
 
+typedef void (*zil_callback_t)(void *data);
+
 typedef struct itx {
 	list_node_t	itx_node;	/* linkage on zl_itx_list */
 	void		*itx_private;	/* type-specific opaque data */
 	itx_wr_state_t	itx_wr_state;	/* write state */
 	uint8_t		itx_sync;	/* synchronous transaction */
+	zil_callback_t	itx_callback;	/* To be called when the write is on persistent storage */
+	void		*itx_callback_data;	/* User data for the callback above */
 	uint64_t	itx_sod;	/* record size on disk */
 	uint64_t	itx_oid;	/* object id */
 	lr_t		itx_lr;	/* common part of log record */
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index cbd6f1cb41a6..c948d6066b10 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -447,21 +447,27 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
 }
 
 /*
- * zfs_log_write() handles TX_WRITE transactions.
+ * zfs_log_write() handles TX_WRITE transactions. The specified callback is
+ * called as soon as the write is on stable storage (be it via a DMU sync or a
+ * ZIL commit).
  */
 long zfs_immediate_write_sz = 32768;
 
 void
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
-    znode_t *zp, offset_t off, ssize_t resid, int ioflag)
+    znode_t *zp, offset_t off, ssize_t resid, int ioflag,
+    zil_callback_t callback, void *callback_data)
 {
 	itx_wr_state_t write_state;
 	boolean_t slogging;
 	uintptr_t fsync_cnt;
 	ssize_t immediate_write_sz;
 
-	if (zil_replaying(zilog, tx) || zp->z_unlinked)
+	if (zil_replaying(zilog, tx) || zp->z_unlinked) {
+		if (callback != NULL)
+			callback(callback_data);
 		return;
+	}
 
 	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) ?
	    0 : (ssize_t)zfs_immediate_write_sz;
@@ -518,6 +524,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 		    (fsync_cnt == 0))
 			itx->itx_sync = B_FALSE;
 
+		itx->itx_callback = callback;
+		itx->itx_callback_data = callback_data;
 		zil_itx_assign(zilog, itx, tx);
 
 		off += len;
diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c
index 876d44b3563d..519b41bf107f 100644
--- a/module/zfs/zfs_vnops.c
+++ b/module/zfs/zfs_vnops.c
@@ -893,7 +893,8 @@ zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr)
 
 		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 
-		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
+		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
+		    NULL, NULL);
 		dmu_tx_commit(tx);
 
 		if (error != 0)
@@ -3815,19 +3816,10 @@ zfs_link(struct inode *tdip, struct inode *sip, char *name, cred_t *cr)
 EXPORT_SYMBOL(zfs_link);
 
 static void
-zfs_putpage_commit_cb(void *arg, int error)
+zfs_putpage_commit_cb(void *arg)
 {
 	struct page *pp = arg;
-
-	if (error) {
-		__set_page_dirty_nobuffers(pp);
-
-		if (error != ECANCELED)
-			SetPageError(pp);
-	} else {
-		ClearPageError(pp);
-	}
-
+	ClearPageError(pp);
 	end_page_writeback(pp);
 }
 
@@ -3861,7 +3853,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 	uint64_t	mtime[2], ctime[2];
 	sa_bulk_attr_t	bulk[3];
 	int		cnt = 0;
-	int		sync;
 
 	ZFS_ENTER(zsb);
 	ZFS_VERIFY_ZP(zp);
@@ -3902,11 +3893,6 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 
 	rl = zfs_range_lock(zp, pgoff, pglen, RL_WRITER);
 
 	tx = dmu_tx_create(zsb->z_os);
-	sync = ((zsb->z_os->os_sync == ZFS_SYNC_ALWAYS) ||
-	    (wbc->sync_mode == WB_SYNC_ALL));
-	if (!sync)
-		dmu_tx_callback_register(tx, zfs_putpage_commit_cb, pp);
-
 	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
@@ -3916,16 +3902,10 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 		if (err == ERESTART)
 			dmu_tx_wait(tx);
 
-		/* Will call all registered commit callbacks */
 		dmu_tx_abort(tx);
-
-		/*
-		 * For the synchronous case the commit callback must be
-		 * explicitly called because there is no registered callback.
-		 */
-		if (sync)
-			zfs_putpage_commit_cb(pp, ECANCELED);
-
+		__set_page_dirty_nobuffers(pp);
+		ClearPageError(pp);
+		end_page_writeback(pp);
 		zfs_range_unlock(rl);
 		ZFS_EXIT(zsb);
 		return (err);
@@ -3948,14 +3928,19 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
 
 	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 
-	zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0);
+	zfs_log_write(zsb->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
+	    zfs_putpage_commit_cb, pp);
 	dmu_tx_commit(tx);
 	zfs_range_unlock(rl);
 
-	if (sync) {
+	if (wbc->sync_mode != WB_SYNC_NONE) {
+		/*
+		 * Note that this is rarely called under writepages(), because
+		 * writepages() normally handles the entire commit for
+		 * performance reasons.
+		 */
 		zil_commit(zsb->z_log, zp->z_id);
-		zfs_putpage_commit_cb(pp, err);
 	}
 
 	ZFS_EXIT(zsb);
 
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index c1796937b568..144f0bf4c980 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -1182,6 +1182,8 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
 	itx->itx_sod = lrsize;	/* if write & WR_NEED_COPY will be increased */
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
+	itx->itx_callback = NULL;
+	itx->itx_callback_data = NULL;
 
 	return (itx);
 }
@@ -1207,6 +1209,8 @@ zil_itxg_clean(itxs_t *itxs)
 
 	list = &itxs->i_sync_list;
 	while ((itx = list_head(list)) != NULL) {
+		if (itx->itx_callback != NULL)
+			itx->itx_callback(itx->itx_callback_data);
 		list_remove(list, itx);
 		kmem_free(itx, offsetof(itx_t, itx_lr) +
 		    itx->itx_lr.lrc_reclen);
@@ -1217,6 +1221,8 @@ zil_itxg_clean(itxs_t *itxs)
 	while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
 		list = &ian->ia_list;
 		while ((itx = list_head(list)) != NULL) {
+			if (itx->itx_callback != NULL)
+				itx->itx_callback(itx->itx_callback_data);
 			list_remove(list, itx);
 			kmem_free(itx, offsetof(itx_t, itx_lr) +
 			    itx->itx_lr.lrc_reclen);
@@ -1283,6 +1289,8 @@ zil_remove_async(zilog_t *zilog, uint64_t oid)
 		mutex_exit(&itxg->itxg_lock);
 	}
 	while ((itx = list_head(&clean_list)) != NULL) {
+		if (itx->itx_callback != NULL)
+			itx->itx_callback(itx->itx_callback_data);
 		list_remove(&clean_list, itx);
 		kmem_free(itx, offsetof(itx_t, itx_lr) +
 		    itx->itx_lr.lrc_reclen);
@@ -1528,15 +1536,13 @@ zil_commit_writer(zilog_t *zilog)
 	}
 
 	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
-	while ((itx = list_head(&zilog->zl_itx_commit_list))) {
+	for (itx = list_head(&zilog->zl_itx_commit_list); itx != NULL;
+	    itx = list_next(&zilog->zl_itx_commit_list, itx)) {
 		txg = itx->itx_lr.lrc_txg;
 		ASSERT(txg);
 
 		if (txg > spa_last_synced_txg(spa) ||
 		    txg > spa_freeze_txg(spa))
 			lwb = zil_lwb_commit(zilog, itx, lwb);
-		list_remove(&zilog->zl_itx_commit_list, itx);
-		kmem_free(itx, offsetof(itx_t, itx_lr)
-		    + itx->itx_lr.lrc_reclen);
 	}
 	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
@@ -1558,6 +1564,17 @@ zil_commit_writer(zilog_t *zilog)
 	if (error || lwb == NULL)
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 
+	while ((itx = list_head(&zilog->zl_itx_commit_list))) {
+		txg = itx->itx_lr.lrc_txg;
+		ASSERT(txg);
+
+		if (itx->itx_callback != NULL)
+			itx->itx_callback(itx->itx_callback_data);
+		list_remove(&zilog->zl_itx_commit_list, itx);
+		kmem_free(itx, offsetof(itx_t, itx_lr)
+		    + itx->itx_lr.lrc_reclen);
+	}
+
 	mutex_enter(&zilog->zl_lock);
 
 	/*
diff --git a/module/zfs/zpl_file.c b/module/zfs/zpl_file.c
index 6598c177971d..2a4c697556d4 100644
--- a/module/zfs/zpl_file.c
+++ b/module/zfs/zpl_file.c
@@ -23,6 +23,7 @@
  */
 
 
+#include <sys/dmu_objset.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_znode.h>
@@ -412,7 +413,43 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
 static int
 zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-	return write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+	znode_t		*zp = ITOZ(mapping->host);
+	zfs_sb_t	*zsb = ITOZSB(mapping->host);
+	enum writeback_sync_modes sync_mode;
+	int result;
+
+	ZFS_ENTER(zsb);
+	if (zsb->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		wbc->sync_mode = WB_SYNC_ALL;
+	ZFS_EXIT(zsb);
+	sync_mode = wbc->sync_mode;
+
+	/*
+	 * We don't want to run write_cache_pages() in SYNC mode here, because
+	 * that would make putpage() wait for a single page to be committed to
+	 * disk every single time, resulting in atrocious performance. Instead
+	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
+	 * and then we commit it all in one go.
+	 */
+	wbc->sync_mode = WB_SYNC_NONE;
+	result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+	if (sync_mode != wbc->sync_mode) {
+		ZFS_ENTER(zsb);
+		ZFS_VERIFY_ZP(zp);
+		zil_commit(zsb->z_log, zp->z_id);
+		ZFS_EXIT(zsb);
+
+		/*
+		 * We need to call write_cache_pages() again (we can't just
+		 * return after the commit) because the previous call in
+		 * non-SYNC mode does not guarantee that we got all the dirty
+		 * pages (see the implementation of write_cache_pages() for
+		 * details). That being said, this is a no-op in most cases.
+		 */
+		wbc->sync_mode = sync_mode;
+		result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+	}
+	return result;
 }
 
 /*
@@ -424,6 +461,8 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
 static int
 zpl_writepage(struct page *pp, struct writeback_control *wbc)
 {
+	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		wbc->sync_mode = WB_SYNC_ALL;
 	return zpl_putpage(pp, wbc, pp->mapping);
 }
 
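
Reviewer note, not part of the patch: the contract the diff establishes is that the callback
handed to zfs_log_write() fires exactly once per write, on whichever path finally disposes of
the itx -- immediately when no itx is created (ZIL replay or an unlinked znode), in
zil_itxg_clean() or zil_remove_async() when the itx is freed without a log commit, or at the
tail of zil_commit_writer() only after the log writes have been issued. The standalone
user-space sketch below models that single-shot pattern; every fake_* identifier and
page_done() are hypothetical names, and only the shape of the API mirrors the diff.

/*
 * Minimal sketch (plain C, builds with any C compiler) of the itx callback
 * contract: a queued record carries an optional completion callback that is
 * invoked exactly once, either immediately when no record is queued, or when
 * the commit pass finally disposes of the record.
 */
#include <stdio.h>
#include <stdlib.h>

typedef void (*zil_callback_t)(void *data);	/* same shape as in zil.h */

typedef struct fake_itx {
	zil_callback_t	itx_callback;		/* fires when data is "stable" */
	void		*itx_callback_data;	/* user data for the callback */
	struct fake_itx	*next;
} fake_itx_t;

static fake_itx_t *commit_list;

/* Analogue of zfs_log_write(): queue a record, or call back right away. */
static void
fake_log_write(int replaying, zil_callback_t cb, void *cb_data)
{
	fake_itx_t *itx;

	if (replaying) {
		/* No itx is created, so the caller is notified immediately. */
		if (cb != NULL)
			cb(cb_data);
		return;
	}
	itx = malloc(sizeof (*itx));
	itx->itx_callback = cb;
	itx->itx_callback_data = cb_data;
	itx->next = commit_list;
	commit_list = itx;
}

/*
 * Analogue of the tail of zil_commit_writer(): callbacks run only after the
 * log writes have been issued, then each record is freed.
 */
static void
fake_commit_writer(void)
{
	fake_itx_t *itx;

	while ((itx = commit_list) != NULL) {
		if (itx->itx_callback != NULL)
			itx->itx_callback(itx->itx_callback_data);
		commit_list = itx->next;
		free(itx);
	}
}

static void
page_done(void *arg)
{
	printf("write of %s is on stable storage\n", (const char *)arg);
}

int
main(void)
{
	fake_log_write(0, page_done, "page A");	/* queued, fires on commit */
	fake_log_write(1, page_done, "page B");	/* replay path, fires now */
	fake_commit_writer();
	return (0);
}

zfs_putpage() relies on exactly this single-shot guarantee to call end_page_writeback() once
per page, whether the data reaches stable storage through a ZIL commit or a DMU txg sync.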