-
Notifications
You must be signed in to change notification settings - Fork 1.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Introduce write throttle smoothing #12868
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,7 @@ | |
.\" own identifying information: | ||
.\" Portions Copyright [yyyy] [name of copyright owner] | ||
.\" | ||
.Dd June 1, 2021 | ||
.Dd January 8, 2021 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks like unnecessary change. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hym, but when we change the man page shouldn't we bump the .Dd? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, this is fine. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, should be more specific - January 8, 2021 is older than June 1, 2021. Or maybe it's a typo and it should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep it should be 2022. Thanks! |
||
.Dt ZFS 4 | ||
.Os | ||
. | ||
|
@@ -944,6 +944,21 @@ This will smoothly handle between ten times and a tenth of this number. | |
.Pp | ||
.Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 . | ||
. | ||
.It Sy zfs_smoothing_write Ns = Ns Sy 0 Ns s Pq ulong | ||
This controls for how many seconds smoothing should be applied. | ||
The smoothing mechanism is used to add additional transaction delays | ||
after the amount of dirty data drops below | ||
.Sy zfs_delay_min_dirty_percent . | ||
This mechanism may be used to avoid stalls and uneven performance during | ||
heavy write workloads | ||
. | ||
.It Sy zfs_smoothing_scale Ns = Ns Sy 100000 Pq int | ||
Similar to | ||
.Sy zfs_delay_scale , | ||
but for write smoothing. | ||
This variable controls the scale of smoothing curve. | ||
Larger values cause longer delays for a given amount of dirty data. | ||
. | ||
.It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int | ||
Disables requirement for IVset GUIDs to be present and match when doing a raw | ||
receive of encrypted datasets. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -776,14 +776,16 @@ static const hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ | |
* of zfs_delay_scale to increase the steepness of the curve. | ||
*/ | ||
static void | ||
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) | ||
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty, hrtime_t last_smooth) | ||
{ | ||
dsl_pool_t *dp = tx->tx_pool; | ||
uint64_t delay_min_bytes = | ||
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; | ||
hrtime_t wakeup, min_tx_time, now; | ||
hrtime_t wakeup, min_tx_time, now, smoothing_time, delay_time; | ||
|
||
if (dirty <= delay_min_bytes) | ||
now = gethrtime(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Additional gethrtime() per I/O may be not huge, but not free also. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This one actually isn't per IO - this one is in one that we "almost know" that we have to delay. |
||
|
||
if (dirty <= delay_min_bytes && last_smooth <= now) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can avoid this early exit if we can take the "performance hit" of the two zero initializations in line 799 and 800, and the MIN(MAX(...)) code in line 811. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't it make code more readable? |
||
return; | ||
|
||
/* | ||
|
@@ -794,11 +796,20 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) | |
*/ | ||
ASSERT3U(dirty, <, zfs_dirty_data_max); | ||
|
||
now = gethrtime(); | ||
min_tx_time = zfs_delay_scale * | ||
(dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); | ||
min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); | ||
if (now > tx->tx_start + min_tx_time) | ||
smoothing_time = 0; | ||
delay_time = 0; | ||
|
||
if (dirty > delay_min_bytes) { | ||
delay_time = zfs_delay_scale * | ||
(dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should probably use the occasion to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I can try that. |
||
} | ||
if (last_smooth > now) { | ||
smoothing_time = zfs_smoothing_scale * dirty / | ||
(zfs_dirty_data_max - dirty); | ||
} | ||
|
||
min_tx_time = MIN(MAX(smoothing_time, delay_time), zfs_delay_max_ns); | ||
if (zfs_smoothing_write == 0 && now > tx->tx_start + min_tx_time) | ||
return; | ||
|
||
DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, | ||
|
@@ -808,6 +819,9 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) | |
wakeup = MAX(tx->tx_start + min_tx_time, | ||
dp->dp_last_wakeup + min_tx_time); | ||
dp->dp_last_wakeup = wakeup; | ||
if (dirty > delay_min_bytes) { | ||
dp->dp_last_smooth = now + zfs_smoothing_write * NANOSEC; | ||
} | ||
mutex_exit(&dp->dp_lock); | ||
|
||
zfs_sleep_until(wakeup); | ||
|
@@ -1069,7 +1083,7 @@ dmu_tx_wait(dmu_tx_t *tx) | |
{ | ||
spa_t *spa = tx->tx_pool->dp_spa; | ||
dsl_pool_t *dp = tx->tx_pool; | ||
hrtime_t before; | ||
hrtime_t before, last_smooth; | ||
|
||
ASSERT(tx->tx_txg == 0); | ||
ASSERT(!dsl_pool_config_held(tx->tx_pool)); | ||
|
@@ -1089,10 +1103,11 @@ dmu_tx_wait(dmu_tx_t *tx) | |
DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); | ||
while (dp->dp_dirty_total >= zfs_dirty_data_max) | ||
cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); | ||
last_smooth = dp->dp_last_smooth; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is the read of Is it because we need to hold the mutex, and don't want to pay the cost of re-acquiring in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, its because we would need to take again a mutex. |
||
dirty = dp->dp_dirty_total; | ||
mutex_exit(&dp->dp_lock); | ||
|
||
dmu_tx_delay(tx, dirty); | ||
dmu_tx_delay(tx, dirty, last_smooth); | ||
|
||
tx->tx_wait_dirty = B_FALSE; | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -103,6 +103,7 @@ unsigned long zfs_dirty_data_max = 0; | |
unsigned long zfs_dirty_data_max_max = 0; | ||
int zfs_dirty_data_max_percent = 10; | ||
int zfs_dirty_data_max_max_percent = 25; | ||
unsigned long zfs_smoothing_write = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't like when users are required to tune things for getting good results. If this patch is so good, why is it disabled by default? And as I've written in my comment, it would be good to understand the actual cause of fluctuation. Why added write accounting did not make it smooth enough? Unavoidable fluctuations due to metadata updates and cache flushes during commit? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alexander, thank you for raising your concerns. In general, the existing mechanism should be enough in most cases. As you mention, the fluctuation with txg commits has been known for a long time, and this is a small This patch is non-ideal - we agree. But we try to address this issue somehow and give at least some mechanism to test. Why is this disabled by default? We successfully tested it on our installation, but we are unsure how all configurations will react. We prefer to start small and see how the broader community will respond to this approach. We are more than happy to enable it by default at some point. On the other hand, if there will be a better approach or it seems unpracticed on most installations, we are more than happy to remove it. Fortunately, the mechanism isn't very intrusive. It is not an on-disk format change, so if we decide to adapt it, or replace it with another mechanism in the future, we won't have any problems doing that. If you have another idea of dealing with fluctuation, we are more than happy to try it. |
||
|
||
/* | ||
* zfs_wrlog_data_max, the upper limit of TX_WRITE log data. | ||
|
@@ -140,6 +141,7 @@ int zfs_delay_min_dirty_percent = 60; | |
* multiply in dmu_tx_delay(). | ||
*/ | ||
unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; | ||
unsigned long zfs_smoothing_scale = 100000; | ||
|
||
/* | ||
* This determines the number of threads used by the dp_sync_taskq. | ||
|
@@ -958,9 +960,10 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp) | |
|
||
mutex_enter(&dp->dp_lock); | ||
uint64_t dirty = dp->dp_dirty_total; | ||
hrtime_t last_delay = dp->dp_last_smooth; | ||
mutex_exit(&dp->dp_lock); | ||
|
||
return (dirty > delay_min_bytes); | ||
return (dirty > delay_min_bytes || last_delay > gethrtime()); | ||
} | ||
|
||
static boolean_t | ||
|
@@ -1462,6 +1465,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, | |
ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW, | ||
"The size limit of write-transaction zil log data"); | ||
|
||
ZFS_MODULE_PARAM(zfs, zfs_, smoothing_write, ULONG, ZMOD_RW, | ||
"How long should we smooth write after last delay (sec)"); | ||
|
||
/* zfs_dirty_data_max_max only applied at module load in arc_init(). */ | ||
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, | ||
"zfs_dirty_data_max upper bound in bytes"); | ||
|
@@ -1472,6 +1478,9 @@ ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW, | |
ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, | ||
"How quickly delay approaches infinity"); | ||
|
||
ZFS_MODULE_PARAM(zfs, zfs_, smoothing_scale, ULONG, ZMOD_RW, | ||
"Delay smoothing scale"); | ||
|
||
ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, | ||
"Max percent of CPUs that are used to sync dirty data"); | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
doc comment suggestion, if I understand the code correctly:
If that interpretation of the variable's role is correct, I'd prefer a different name.
For example,
dp_stop_write_smooth_after
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This isn't a variable described in doc.
This is variable to store a time of last smoothing.