diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index c9db6d428ea2..9a34bafc1c77 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -182,7 +182,6 @@ typedef struct zil_vdev_node { } zil_vdev_node_t; #define ZIL_BURSTS 8 -#define ZIL_PREV_BLKS 16 /* * Stable storage intent log management structure. One per dataset. @@ -217,7 +216,9 @@ struct zilog { uint64_t zl_parse_lr_count; /* number of log records parsed */ itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ list_t zl_itx_commit_list; /* itx list to be committed */ - uint64_t zl_cur_used; /* current commit log size used */ + uint64_t zl_cur_size; /* current burst full size */ + uint64_t zl_cur_left; /* current burst remaining size */ + uint64_t zl_cur_max; /* biggest record in current burst */ list_t zl_lwb_list; /* in-flight log write list */ avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ @@ -225,7 +226,8 @@ struct zilog { zil_header_t zl_old_header; /* debugging aid */ uint_t zl_parallel; /* workload is multi-threaded */ uint_t zl_prev_rotor; /* rotor for zl_prev[] */ - uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_prev_opt[ZIL_BURSTS]; /* optimal block size */ + uint_t zl_prev_min[ZIL_BURSTS]; /* minimal first block size */ txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 7ad0fb344b7b..9b5d866a8c22 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -144,6 +144,7 @@ static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); +static uint64_t zil_max_waste_space(zilog_t *zilog); static int zil_bp_compare(const void *x1, const void *x2) @@ -1710,24 +1711,6 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) mutex_exit(&zilog->zl_lock); } -/* - * Define a limited 
set of intent log block sizes. - * - * These must be a multiple of 4KB. Note only the amount used (again - * aligned to 4KB) actually gets written. However, we can't always just - * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. - */ -static const struct { - uint64_t limit; - uint64_t blksz; -} zil_block_buckets[] = { - { 4096, 4096 }, /* non TX_WRITE */ - { 8192 + 4096, 8192 + 4096 }, /* database */ - { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ - { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ -}; - /* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. Otherwise this should not be used directly; see @@ -1735,6 +1718,91 @@ static const struct { */ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; +/* + * Plan splitting of the provided burst size between several blocks. + * Returns the planned payload size of one block, not counting the + * zil_chain_t header (the caller adds it). Stores in *minsize the + * minimal size the first block of such a burst is expected to need, + * or 0 when no useful prediction of the first block can be made. + */ +static uint_t +zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize) +{ + uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t); + + if (size <= md) { + /* + * Small bursts are written as-is in one block. + */ + *minsize = size; + return (size); + } else if (size > 8 * md) { + /* + * Big bursts use maximum blocks. The first block size + * is hard to predict, but it does not really matter. + */ + *minsize = 0; + return (md); + } + + /* + * Medium bursts try to divide evenly to better utilize several SLOG + * VDEVs. The first block size we predict assuming the worst case of + * maxing out others. Fall back to using maximum blocks if due to + * large records or wasted space we can not predict anything better. 
+ */ + uint_t s = size; + uint_t n = DIV_ROUND_UP(s, md - sizeof (lr_write_t)); + uint_t chunk = DIV_ROUND_UP(s, n); + uint_t waste = zil_max_waste_space(zilog); + waste = MAX(waste, zilog->zl_cur_max); + if (chunk <= md - waste) { + *minsize = MAX(s - (md - waste) * (n - 1), waste); + return (chunk); + } else { + *minsize = 0; + return (md); + } +} + +/* + * Try to predict next block size based on previous history. Make prediction + * sufficient for 7 of 8 previous bursts. Don't try to save if the saving is + * less than 50%, extra writes may cost more, but we don't want single spike + * to badly affect our predictions. + */ +static uint_t +zil_lwb_predict(zilog_t *zilog) +{ + uint_t m, o; + + /* If we are in the middle of a burst, take it into account also. */ + if (zilog->zl_cur_size > 0) { + o = zil_lwb_plan(zilog, zilog->zl_cur_size, &m); + } else { + o = UINT_MAX; + m = 0; + } + + /* Find minimum optimal size. We don't need to go below that. */ + for (int i = 0; i < ZIL_BURSTS; i++) + o = MIN(o, zilog->zl_prev_opt[i]); + + /* Find two biggest minimal first block sizes above the optimal. */ + uint_t m1 = MAX(m, o), m2 = o; + for (int i = 0; i < ZIL_BURSTS; i++) { + m = zilog->zl_prev_min[i]; + if (m >= m1) { + m2 = m1; + m1 = m; + } else if (m > m2) { + m2 = m; + } + } + + /* + * If the second biggest size (m2) gives 50% saving over the biggest + * one (m1) -- use it. It may cost us one additional write later, + * but the space saving is just too big. + */ + return ((m1 < m2 * 2) ? m1 : m2); +} + /* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. 
@@ -1742,7 +1810,7 @@ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; static lwb_t * zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { - int i; + uint64_t blksz, plan, plan2; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); @@ -1757,34 +1825,40 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) return (NULL); /* - * Log blocks are pre-allocated. Here we select the size of the next - * block, based on size used in the last block. - * - first find the smallest bucket that will fit the block from a - * limited set of block sizes. This is because it's faster to write - * blocks allocated from the same metaslab as they are adjacent or - * close. - * - next find the maximum from the new suggested size and an array of - * previous sizes. This lessens a picket fence effect of wrongly - * guessing the size if we have a stream of say 2k, 64k, 2k, 64k - * requests. - * - * Note we only write what is used, but we can't just allocate - * the maximum block size because we can exhaust the available - * pool log space. + * Log blocks are pre-allocated. Here we select the size of the next + * block, based on what's left of this burst and the previous history. + * While we try to only write used part of the block, we can't just + * always allocate the maximum block size because we can exhaust all + * available pool log space, so we try to be reasonable. 
*/ - uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); - for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) - continue; - zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); - zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; - for (i = 0; i < ZIL_PREV_BLKS; i++) - zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); - DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, - uint64_t, zil_blksz, - uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); - zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); - - return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); + if (zilog->zl_cur_left > 0) { + /* + * We are in the middle of a burst and know how much is left. + * But if workload is multi-threaded there may be more soon. + * Try to predict what it can be and plan for the worst case: + * use the bigger of the plans made with and without the + * predicted addition. + */ + uint_t m; + plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m); + if (zilog->zl_parallel) { + plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left + + zil_lwb_predict(zilog), &m); + if (plan < plan2) + plan = plan2; + } + } else { + /* + * The previous burst is done and we can only predict what + * will come next. 
+ */ + plan = zil_lwb_predict(zilog); + } + blksz = plan + sizeof (zil_chain_t); /* plan excludes the header */ + blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t); + blksz = MIN(blksz, zilog->zl_max_block_size); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz, + uint64_t, plan); + + return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state)); } /* @@ -1835,7 +1909,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; @@ -1996,6 +2070,42 @@ zil_max_copied_data(zilog_t *zilog) return (MIN(max_data, zil_maxcopied)); } +static uint64_t +zil_itx_record_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen); +} + +static uint64_t +zil_itx_data_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + lr_write_t *lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(lr->lrc_reclen, ==, sizeof (lr_write_t)); + return (P2ROUNDUP_TYPED(lrw->lr_length, sizeof (uint64_t), + uint64_t)); + } + return (0); +} + +static uint64_t +zil_itx_full_size(itx_t *itx) +{ + lr_t *lr = &itx->itx_lr; + + if (lr->lrc_txtype == TX_COMMIT) + return (0); + ASSERT3U(lr->lrc_reclen, >=, sizeof (lr_t)); + return (lr->lrc_reclen + zil_itx_data_size(itx)); +} + /* * Estimate space needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. 
@@ -2038,16 +2148,9 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) } reclen = lr->lrc_reclen; - if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - ASSERT3U(reclen, ==, sizeof (lr_write_t)); - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - ASSERT3U(reclen, >=, sizeof (lr_t)); - dlen = 0; - } + ASSERT3U(reclen, >=, sizeof (lr_t)); ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); - zilog->zl_cur_used += (reclen + dlen); + dlen = zil_itx_data_size(itx); /* 0 unless WR_NEED_COPY TX_WRITE */ cont: /* @@ -2088,6 +2191,7 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; + zilog->zl_cur_left -= dnow; } else { citx = itx; clr = lr; @@ -2109,10 +2213,8 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; + if (dlen > 0) goto cont; - } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) @@ -2139,13 +2241,8 @@ zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) if (lr->lrc_txtype == TX_COMMIT) return; - if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { - dlen = P2ROUNDUP_TYPED( - lrw->lr_length, sizeof (uint64_t), uint64_t); - } else { - dlen = 0; - } reclen = lr->lrc_reclen; + dlen = zil_itx_data_size(itx); ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; @@ -2576,6 +2673,7 @@ zil_get_commit_list(zilog_t *zilog) ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; + itx_t *itx = NULL; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. 
@@ -2585,10 +2683,20 @@ zil_get_commit_list(zilog_t *zilog) if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { + itx = list_head(sync_list); /* first of the itxs moved below */ list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); + + while (itx != NULL) { /* add the moved itxs to the current burst */ + uint64_t s = zil_itx_full_size(itx); + zilog->zl_cur_size += s; + zilog->zl_cur_left += s; + s = zil_itx_record_size(itx); + zilog->zl_cur_max = MAX(zilog->zl_cur_max, s); + itx = list_next(commit_list, itx); + } } return (wtxg); } @@ -2728,13 +2836,20 @@ static void zil_burst_done(zilog_t *zilog) { if (!list_is_empty(&zilog->zl_itx_commit_list) || - zilog->zl_cur_used == 0) + zilog->zl_cur_size == 0) return; if (zilog->zl_parallel) zilog->zl_parallel--; - zilog->zl_cur_used = 0; + uint_t r = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1); /* mask wrap relies on ZIL_BURSTS being a power of 2 */ + zilog->zl_prev_rotor = r; + zilog->zl_prev_opt[r] = zil_lwb_plan(zilog, zilog->zl_cur_size, + &zilog->zl_prev_min[r]); + + zilog->zl_cur_size = 0; + zilog->zl_cur_max = 0; + zilog->zl_cur_left = 0; } /* @@ -2867,6 +2982,8 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * itx list to somebody else who care. */ zilog->zl_parallel = ZIL_BURSTS; + zilog->zl_cur_left -= + zil_itx_full_size(itx); break; } } else { @@ -2876,8 +2993,10 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) } list_insert_tail(&nolwb_itxs, itx); } + zilog->zl_cur_left -= zil_itx_full_size(itx); } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); + zilog->zl_cur_left -= zil_itx_full_size(itx); zil_itx_destroy(itx); } } @@ -2960,9 +3079,9 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * of each individual itx. 
*/ if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { + zil_burst_done(zilog); /* update burst history before the next block is sized */ list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); - zil_burst_done(zilog); if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); @@ -3120,12 +3239,11 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ + zil_burst_done(zilog); /* update burst history before the next block is sized */ lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); - zil_burst_done(zilog); - if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this @@ -3720,7 +3838,9 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; - zilog->zl_max_block_size = zil_maxblocksize; + zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize, + ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ), + spa_maxblocksize(dmu_objset_spa(os))); mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3740,6 +3860,11 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); + for (int i = 0; i < ZIL_BURSTS; i++) { + zilog->zl_prev_opt[i] = zilog->zl_max_block_size - + sizeof (zil_chain_t); + } + return (zilog); }